- Adding HoodieCombineHiveInputFormat for COW tables (#811)
- The combine input format cuts down the number of map tasks on large scans by merging many small splits into fewer, larger ones - Implementation targets Hive 2.x and above
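A minimal sketch of how the new input format might be enabled for a Hive query, assuming the hoodie-hadoop-mr bundle is already on the Hive classpath (the split size below is an illustrative value, not part of this change):

    set hive.input.format=com.uber.hoodie.hadoop.hive.HoodieCombineHiveInputFormat;
    set mapreduce.input.fileinputformat.split.maxsize=134217728; -- combine small files up to ~128 MB per split
    select count(*) from hoodie_cow_table;                       -- hypothetical COW table registered in Hive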
@@ -0,0 +1,933 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hadoop.hive;

import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.hadoop.HoodieInputFormat;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.CombineHiveRecordReader;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
import org.apache.hadoop.hive.shims.HadoopShimsSecure;
import org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This is just a copy of the org.apache.hadoop.hive.ql.io.CombineHiveInputFormat from Hive 2.x.
 * Search for **MOD** to see the minor modifications made to support a custom input format in
 * CombineHiveInputFormat. See https://issues.apache.org/jira/browse/HIVE-9771
 * <p>
 * CombineHiveInputFormat is a parameterized InputFormat which looks at the path name and
 * determines the correct InputFormat for that path name from mapredPlan.pathToPartitionInfo().
 * It can be used to read files with different input formats in the same map-reduce job.
 * <p>
 * NOTE : This class is implemented to work with Hive 2.x and above.
 */
public class HoodieCombineHiveInputFormat<K extends WritableComparable, V extends Writable>
    extends HiveInputFormat<K, V> {

  private static final String CLASS_NAME = HoodieCombineHiveInputFormat.class.getName();
  public static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);

  // max number of threads we can use to check non-combinable paths
  private static final int MAX_CHECK_NONCOMBINABLE_THREAD_NUM = 50;
  private static final int DEFAULT_NUM_PATH_PER_THREAD = 100;

  private class CheckNonCombinablePathCallable implements Callable<Set<Integer>> {

    private final Path[] paths;
    private final int start;
    private final int length;
    private final JobConf conf;

    public CheckNonCombinablePathCallable(Path[] paths, int start, int length, JobConf conf) {
      this.paths = paths;
      this.start = start;
      this.length = length;
      this.conf = conf;
    }

    @Override
    public Set<Integer> call() throws Exception {
      Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
      for (int i = 0; i < length; i++) {
        PartitionDesc part =
            HiveFileFormatUtils.getPartitionDescFromPathRecursively(
                pathToPartitionInfo, paths[i + start],
                IOPrepareCache.get().allocatePartitionDescMap());
        // Use HiveInputFormat if any of the paths is not splittable
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        InputFormat<WritableComparable, Writable> inputFormat =
            getInputFormatFromCache(inputFormatClass, conf);
        if (inputFormat instanceof AvoidSplitCombination
            && ((AvoidSplitCombination) inputFormat).shouldSkipCombine(paths[i + start], conf)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("The path [" + paths[i + start]
                + "] is being parked for HiveInputFormat.getSplits");
          }
          nonCombinablePathIndices.add(i + start);
        }
      }
      return nonCombinablePathIndices;
    }
  }

  /**
   * CombineHiveInputSplit encapsulates an InputSplit with its corresponding
   * inputFormatClassName. A CombineHiveInputSplit comprises multiple chunks
   * from different files. Since they belong to a single directory, there is a
   * single input format for all the chunks.
   */
  public static class CombineHiveInputSplit extends InputSplitShim {

    private String inputFormatClassName;
    private CombineFileSplit inputSplitShim;
    private Map<Path, PartitionDesc> pathToPartitionInfo;

    public CombineHiveInputSplit() throws IOException {
      this(ShimLoader.getHadoopShims().getCombineFileInputFormat()
          .getInputSplitShim());
    }

    public CombineHiveInputSplit(CombineFileSplit inputSplitShim) throws IOException {
      this(inputSplitShim.getJob(), inputSplitShim);
    }

    public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim)
        throws IOException {
      this(job, inputSplitShim, null);
    }

    public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
        Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
      this.inputSplitShim = inputSplitShim;
      this.pathToPartitionInfo = pathToPartitionInfo;
      if (job != null) {
        if (this.pathToPartitionInfo == null) {
          this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
        }

        // extract all the inputFormatClass names for each chunk in the
        // CombinedSplit.
        Path[] ipaths = inputSplitShim.getPaths();
        if (ipaths.length > 0) {
          PartitionDesc part = HiveFileFormatUtils
              .getPartitionDescFromPathRecursively(this.pathToPartitionInfo,
                  ipaths[0], IOPrepareCache.get().getPartitionDescMap());
          inputFormatClassName = part.getInputFileFormatClass().getName();
        }
      }
    }

    public CombineFileSplit getInputSplitShim() {
      return inputSplitShim;
    }

    /**
     * Returns the inputFormat class name for the i-th chunk.
     */
    public String inputFormatClassName() {
      return inputFormatClassName;
    }

    public void setInputFormatClassName(String inputFormatClassName) {
      this.inputFormatClassName = inputFormatClassName;
    }

    @Override
    public JobConf getJob() {
      return inputSplitShim.getJob();
    }

    @Override
    public long getLength() {
      return inputSplitShim.getLength();
    }

    /**
     * Returns an array containing the start offsets of the files in the split.
     */
    @Override
    public long[] getStartOffsets() {
      return inputSplitShim.getStartOffsets();
    }

    /**
     * Returns an array containing the lengths of the files in the split.
     */
    @Override
    public long[] getLengths() {
      return inputSplitShim.getLengths();
    }

    /**
     * Returns the start offset of the i<sup>th</sup> Path.
     */
    @Override
    public long getOffset(int i) {
      return inputSplitShim.getOffset(i);
    }

    /**
     * Returns the length of the i<sup>th</sup> Path.
     */
    @Override
    public long getLength(int i) {
      return inputSplitShim.getLength(i);
    }

    /**
     * Returns the number of Paths in the split.
     */
    @Override
    public int getNumPaths() {
      return inputSplitShim.getNumPaths();
    }

    /**
     * Returns the i<sup>th</sup> Path.
     */
    @Override
    public Path getPath(int i) {
      return inputSplitShim.getPath(i);
    }

    /**
     * Returns all the Paths in the split.
     */
    @Override
    public Path[] getPaths() {
      return inputSplitShim.getPaths();
    }

    /**
     * Returns all the Paths where this input-split resides.
     */
    @Override
    public String[] getLocations() throws IOException {
      return inputSplitShim.getLocations();
    }

    /**
     * Prints this object as a string.
     */
    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(inputSplitShim.toString());
      sb.append("InputFormatClass: " + inputFormatClassName);
      sb.append("\n");
      return sb.toString();
    }

    /**
     * Writable interface.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
      inputSplitShim.readFields(in);
      inputFormatClassName = in.readUTF();
    }

    /**
     * Writable interface.
     */
    @Override
    public void write(DataOutput out) throws IOException {
      inputSplitShim.write(out);
      if (inputFormatClassName == null) {
        if (pathToPartitionInfo == null) {
          pathToPartitionInfo = Utilities.getMapWork(getJob()).getPathToPartitionInfo();
        }

        // extract all the inputFormatClass names for each chunk in the
        // CombinedSplit.
        PartitionDesc part =
            HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo,
                inputSplitShim.getPath(0), IOPrepareCache.get().getPartitionDescMap());

        // create a new InputFormat instance if this is the first time to see
        // this class
        inputFormatClassName = part.getInputFileFormatClass().getName();
      }

      out.writeUTF(inputFormatClassName);
    }
  }

  // Splits are not shared across different partitions with different input formats.
  // For example, 2 partitions (1 sequencefile and 1 rcfile) will have 2 different splits.
  private static class CombinePathInputFormat {

    private final List<Operator<? extends OperatorDesc>> opList;
    private final String inputFormatClassName;
    private final String deserializerClassName;

    public CombinePathInputFormat(List<Operator<? extends OperatorDesc>> opList,
        String inputFormatClassName,
        String deserializerClassName) {
      this.opList = opList;
      this.inputFormatClassName = inputFormatClassName;
      this.deserializerClassName = deserializerClassName;
    }

    @Override
    public boolean equals(Object o) {
      if (o instanceof CombinePathInputFormat) {
        CombinePathInputFormat mObj = (CombinePathInputFormat) o;
        return (opList.equals(mObj.opList))
            && (inputFormatClassName.equals(mObj.inputFormatClassName))
            && (deserializerClassName == null ? (mObj.deserializerClassName == null) :
            deserializerClassName.equals(mObj.deserializerClassName));
      }
      return false;
    }

    @Override
    public int hashCode() {
      return (opList == null) ? 0 : opList.hashCode();
    }
  }

  /**
   * Create Hive splits based on CombineFileSplit.
   */
  private InputSplit[] getCombineSplits(JobConf job, int numSplits,
      Map<Path, PartitionDesc> pathToPartitionInfo)
      throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork =
        mrwork.getAliasToWork();
    /** MOD - Initialize a custom combine input format shim that will call listStatus on the custom inputFormat **/
    HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim
        combine = new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim();

    InputSplit[] splits = null;
    if (combine == null) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }

    if (combine.getInputPathsShim(job).length == 0) {
      throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));

    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap =
        new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();

    for (Path path : paths) {
      PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
          pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
      TableDesc tableDesc = part.getTableDesc();
      if ((tableDesc != null) && tableDesc.isNonNative()) {
        return super.getSplits(job, numSplits);
      }

      // Use HiveInputFormat if any of the paths is not splittable
      Class inputFormatClass = part.getInputFileFormatClass();
      String inputFormatClassName = inputFormatClass.getName();
      InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
      LOG.info("Input Format => " + inputFormatClass.getName());
      // **MOD** Set the hoodie filter in the combine
      if (inputFormatClass.getName().equals(HoodieInputFormat.class.getName())) {
        combine.setHoodieFilter(true);
      } else if (inputFormatClass.getName().equals(HoodieRealtimeInputFormat.class.getName())) {
        LOG.info("Setting hoodie filter and realtime input format");
        combine.setHoodieFilter(true);
        combine.setRealTime(true);
      }
      String deserializerClassName = null;
      try {
        deserializerClassName = part.getDeserializer(job).getClass().getName();
      } catch (Exception e) {
        // ignore
      }
      FileSystem inpFs = path.getFileSystem(job);

      // don't combine if the input format is a SymlinkTextInputFormat
      if (inputFormat instanceof SymlinkTextInputFormat) {
        splits = super.getSplits(job, numSplits);
        return splits;
      }

      Path filterPath = path;

      // Does a pool exist for this path already
      CombineFilter f = null;
      List<Operator<? extends OperatorDesc>> opList = null;

      if (!mrwork.isMapperCannotSpanPartns()) {
        // if the mapper can span partitions, make sure a split does not contain multiple
        // opList + inputFormatClassName + deserializerClassName combinations
        // This is done using the Map of CombinePathInputFormat to PathFilter

        opList = HiveFileFormatUtils.doGetWorksFromPath(
            pathToAliases, aliasToWork, filterPath);
        CombinePathInputFormat combinePathInputFormat =
            new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
        f = poolMap.get(combinePathInputFormat);
        if (f == null) {
          f = new CombineFilter(filterPath);
          LOG.info("CombineHiveInputSplit creating pool for "
              + path + "; using filter path " + filterPath);
          combine.createPool(job, f);
          poolMap.put(combinePathInputFormat, f);
        } else {
          LOG.info("CombineHiveInputSplit: pool is already created for "
              + path + "; using filter path " + filterPath);
          f.addPath(filterPath);
        }
      } else {
        // In the case of tablesample, the input paths are pointing to files rather than directories.
        // We need to get the parent directory as the filtering path so that all files in the same
        // parent directory will be grouped into one pool but not files from different parent
        // directories. This guarantees that a split will combine all files in the same partition
        // but won't cross multiple partitions if the user has asked so.
        if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
          filterPath = path.getParent();
          inpFiles.add(path);
          poolSet.add(filterPath);
        } else {
          inpDirs.add(path);
        }
      }
    }

    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
      // mapper can span partitions
      // combine into as few as one split, subject to the PathFilters set
      // using combine.createPool.
      iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
      for (Path path : inpDirs) {
        processPaths(job, combine, iss, path);
      }

      if (inpFiles.size() > 0) {
        // Processing files
        for (Path filterPath : poolSet) {
          combine.createPool(job, new CombineFilter(filterPath));
        }
        processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
      }
    }

    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
      iss = sampleSplits(iss);
    }

    for (CombineFileSplit is : iss) {
      CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
      result.add(csplit);
    }

    LOG.info("number of splits " + result.size());
    return result.toArray(new CombineHiveInputSplit[result.size()]);
  }

  /**
   * Gets all the path indices that should not be combined.
   */
  @VisibleForTesting
  public Set<Integer> getNonCombinablePathIndices(JobConf job, Path[] paths, int numThreads)
      throws ExecutionException, InterruptedException {
    LOG.info("Total number of paths: " + paths.length
        + ", launching " + numThreads + " threads to check non-combinable ones.");
    int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads);

    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
    List<Future<Set<Integer>>> futureList = new ArrayList<Future<Set<Integer>>>(numThreads);
    try {
      for (int i = 0; i < numThreads; i++) {
        int start = i * numPathPerThread;
        int length = i != numThreads - 1 ? numPathPerThread : paths.length - start;
        futureList.add(executor.submit(
            new CheckNonCombinablePathCallable(paths, start, length, job)));
      }
      Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
      for (Future<Set<Integer>> future : futureList) {
        nonCombinablePathIndices.addAll(future.get());
      }
      return nonCombinablePathIndices;
    } finally {
      executor.shutdownNow();
    }
  }

  /**
   * Create Hive splits based on CombineFileSplit.
   */
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);

    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    Path[] paths = getInputPaths(job);

    List<Path> nonCombinablePaths = new ArrayList<Path>(paths.length / 2);
    List<Path> combinablePaths = new ArrayList<Path>(paths.length / 2);

    int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
        (int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));

    // This check is necessary because for Spark branch, the result array from
    // getInputPaths() above could be empty, and therefore numThreads could be 0.
    // In that case, Executors.newFixedThreadPool will fail.
    if (numThreads > 0) {
      try {
        Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
        for (int i = 0; i < paths.length; i++) {
          if (nonCombinablePathIndices.contains(i)) {
            nonCombinablePaths.add(paths[i]);
          } else {
            combinablePaths.add(paths[i]);
          }
        }
      } catch (Exception e) {
        LOG.error("Error checking non-combinable path", e);
        perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
        throw new IOException(e);
      }
    }

    // Store the previous value for the path specification
    String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    if (LOG.isDebugEnabled()) {
      LOG.debug("The received input paths are: [" + oldPaths
          + "] against the property "
          + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
    }

    // Process the normal splits
    if (nonCombinablePaths.size() > 0) {
      FileInputFormat.setInputPaths(job,
          nonCombinablePaths.toArray(new Path[nonCombinablePaths.size()]));
      InputSplit[] splits = super.getSplits(job, numSplits);
      for (InputSplit split : splits) {
        result.add(split);
      }
    }

    // Process the combine splits
    if (combinablePaths.size() > 0) {
      FileInputFormat.setInputPaths(job,
          combinablePaths.toArray(new Path[combinablePaths.size()]));
      Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null
          ? this.pathToPartitionInfo : Utilities.getMapWork(job).getPathToPartitionInfo();
      InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
      for (InputSplit split : splits) {
        result.add(split);
      }
    }

    // Restore the old path information back
    // This is just to prevent incompatibilities with previous versions of Hive
    // if some application depends on the original value being set.
    if (oldPaths != null) {
      job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
    }

    // clear work from ThreadLocal after splits are generated, in case the thread is reused in the pool.
    Utilities.clearWorkMapForConf(job);

    LOG.info("Number of all splits " + result.size());
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new InputSplit[result.size()]);
  }

  private void processPaths(JobConf job, CombineFileInputFormatShim combine,
      List<CombineFileSplit> iss, Path... path) throws IOException {
    JobConf currJob = new JobConf(job);
    FileInputFormat.setInputPaths(currJob, path);
    iss.addAll(Arrays.asList(combine.getSplits(currJob, 1)));
  }

  /**
   * MOD - Just added this for visibility.
   **/
  Path[] getInputPaths(JobConf job) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
      // on tez we avoid duplicating the file info in FileInputFormat.
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        try {
          List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
          dirs = paths.toArray(new Path[paths.size()]);
        } catch (Exception e) {
          throw new IOException("Could not create input files", e);
        }
      } else {
        throw new IOException("No input paths specified in job");
      }
    }
    return dirs;
  }

  /**
   * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)".
   * <p>
   * First, splits are grouped by the alias they are for. If a split serves more than one
   * alias, or does not serve any sampled alias, it is added directly to the returned list.
   * Then we find the list of exclusive splits for every alias to be sampled.
   * For each alias, we start from the position seedNumber % totalNumber and keep adding
   * splits until the accumulated size reaches the sampled percentage of the total.
   *
   * @return the sampled splits
   */
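  // Worked example with assumed numbers (illustration only, not part of the original patch):
  // TABLESAMPLE(1 PERCENT) over an alias whose exclusive splits total 10 GB gives a target of
  // ~100 MB; with seedNum = 7 and 5 splits we start at index 7 % 5 = 2 and keep adding splits
  // 2, 3, ... (wrapping around) until ~100 MB is accumulated, shrinking the last split to fit.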
  private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
    HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
    List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
    Map<String, ArrayList<CombineFileSplit>> aliasToSplitList =
        new HashMap<String, ArrayList<CombineFileSplit>>();
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);

    // Populate list of exclusive splits for every sampled alias
    //
    for (CombineFileSplit split : splits) {
      String alias = null;
      for (Path path : split.getPaths()) {
        boolean schemeless = path.toUri().getScheme() == null;
        List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
            schemeless ? pathToAliasesNoScheme : pathToAliases, path);
        // a path disqualifies the split from being sampled if:
        // 1. it serves more than one alias
        // 2. the alias it serves is not sampled
        // 3. it serves a different alias than another path for the same split
        if (l.size() != 1 || !nameToSamples.containsKey(l.get(0))
            || (alias != null && l.get(0) != alias)) {
          alias = null;
          break;
        }
        alias = l.get(0);
      }

      if (alias != null) {
        // split exclusively serves alias, which needs to be sampled
        // add it to the split list of the alias.
        if (!aliasToSplitList.containsKey(alias)) {
          aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
        }
        aliasToSplitList.get(alias).add(split);
      } else {
        // The split doesn't exclusively serve one alias
        retLists.add(split);
      }
    }

    // for every sampled alias, we figure out splits to be sampled and add
    // them to return list
    //
    for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
      ArrayList<CombineFileSplit> splitList = entry.getValue();
      long totalSize = 0;
      for (CombineFileSplit split : splitList) {
        totalSize += split.getLength();
      }

      SplitSample splitSample = nameToSamples.get(entry.getKey());

      long targetSize = splitSample.getTargetSize(totalSize);
      int startIndex = splitSample.getSeedNum() % splitList.size();
      long size = 0;
      for (int i = 0; i < splitList.size(); i++) {
        CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
        retLists.add(split);
        long splitgLength = split.getLength();
        if (size + splitgLength >= targetSize) {
          LOG.info("Sample alias " + entry.getValue() + " using " + (i + 1) + " splits");
          if (size + splitgLength > targetSize) {
            ((InputSplitShim) split).shrinkSplit(targetSize - size);
          }
          break;
        }
        size += splitgLength;
      }
    }

    return retLists;
  }

  Map<Path, ArrayList<String>> removeScheme(Map<Path, ArrayList<String>> pathToAliases) {
    Map<Path, ArrayList<String>> result = new HashMap<>();
    for (Map.Entry<Path, ArrayList<String>> entry : pathToAliases.entrySet()) {
      Path newKey = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
      StringInternUtils.internUriStringsInPath(newKey);
      result.put(newKey, entry.getValue());
    }
    return result;
  }

  /**
   * Create a generic Hive RecordReader that can iterate over all chunks in a
   * CombinedFileSplit.
   */
  @Override
  public RecordReader getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    if (!(split instanceof CombineHiveInputSplit)) {
      return super.getRecordReader(split, job, reporter);
    }

    CombineHiveInputSplit hsplit = (CombineHiveInputSplit) split;

    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
      inputFormatClassName = hsplit.inputFormatClassName();
      inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
      throw new IOException("cannot find class " + inputFormatClassName);
    }

    pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath(0));

    return ShimLoader.getHadoopShims().getCombineFileInputFormat()
        .getRecordReader(job,
            (CombineFileSplit) split, reporter,
            CombineHiveRecordReader.class);
  }

  static class CombineFilter implements PathFilter {

    private final Set<String> pStrings = new HashSet<String>();

    // store a path prefix in this TestFilter
    // PRECONDITION: p should always be a directory
    public CombineFilter(Path p) {
      // we need to keep the path part only because the Hadoop CombineFileInputFormat will
      // pass the path part only to accept().
      // Trailing the path with a separator to prevent partial matching.
      addPath(p);
    }

    public void addPath(Path p) {
      String pString = p.toUri().getPath();
      pStrings.add(pString);
    }

    // returns true if the specified path matches the prefix stored
    // in this TestFilter.
    @Override
    public boolean accept(Path path) {
      boolean find = false;
      while (path != null && !find) {
        if (pStrings.contains(path.toUri().getPath())) {
          find = true;
          break;
        }
        path = path.getParent();
      }
      return find;
    }

    @Override
    public String toString() {
      StringBuilder s = new StringBuilder();
      s.append("PathFilter: ");
      for (String pString : pStrings) {
        s.append(pString + " ");
      }
      return s.toString();
    }
  }

  /**
   * This is a marker interface that is used to identify the formats where
   * combine split generation is not applicable.
   */
  public interface AvoidSplitCombination {

    boolean shouldSkipCombine(Path path, Configuration conf) throws IOException;
  }
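
  // A hypothetical example (not part of this change) of an input format opting out of split
  // combination, so that its paths are handed to plain HiveInputFormat.getSplits instead:
  //
  //   public static class NonCombinableInputFormat extends HoodieInputFormat
  //       implements AvoidSplitCombination {
  //     @Override
  //     public boolean shouldSkipCombine(Path path, Configuration conf) {
  //       return true; // never combine splits produced by this format
  //     }
  //   }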

  /**
   * **MOD** this is the implementation of CombineFileInputFormat which is a copy of
   * org.apache.hadoop.hive.shims.HadoopShimsSecure.CombineFileInputFormatShim
   * with changes in listStatus.
   */
  public static class HoodieCombineFileInputFormatShim<K, V> extends CombineFileInputFormat<K, V>
      implements org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim<K, V> {

    private boolean hoodieFilter = false;
    private boolean isRealTime = false;

    public HoodieCombineFileInputFormatShim() {
    }

    public Path[] getInputPathsShim(JobConf conf) {
      try {
        return FileInputFormat.getInputPaths(conf);
      } catch (Exception var3) {
        throw new RuntimeException(var3);
      }
    }

    public void createPool(JobConf conf, PathFilter... filters) {
      super.createPool(conf, filters);
    }

    @Override
    public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
      throw new IOException("CombineFileInputFormat.getRecordReader not needed.");
    }

    protected List<FileStatus> listStatus(JobContext job) throws IOException {
      LOG.info("Listing status in HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim");
      List<FileStatus> result;
      if (hoodieFilter) {
        HoodieInputFormat input;
        if (isRealTime) {
          LOG.info("Using HoodieRealtimeInputFormat");
          input = new HoodieRealtimeInputFormat();
        } else {
          LOG.info("Using HoodieInputFormat");
          input = new HoodieInputFormat();
        }
        input.setConf(job.getConfiguration());
        result = new ArrayList<FileStatus>(
            Arrays.asList(input.listStatus(new JobConf(job.getConfiguration()))));
      } else {
        result = super.listStatus(job);
      }

      Iterator it = result.iterator();

      while (it.hasNext()) {
        FileStatus stat = (FileStatus) it.next();
        if (!stat.isFile()) {
          it.remove();
        }
      }
      return result;
    }

    public CombineFileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 0L);
      if (job.getLong("mapreduce.input.fileinputformat.split.minsize.per.node", 0L) == 0L) {
        super.setMinSplitSizeNode(minSize);
      }

      if (job.getLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 0L) == 0L) {
        super.setMinSplitSizeRack(minSize);
      }

      if (job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, 0L) == 0L) {
        super.setMaxSplitSize(minSize);
      }

      InputSplit[] splits = super.getSplits(job, numSplits);
      ArrayList inputSplitShims = new ArrayList();

      for (int pos = 0; pos < splits.length; ++pos) {
        CombineFileSplit split = (CombineFileSplit) splits[pos];
        if (split.getPaths().length > 0) {
          inputSplitShims.add(
              new HadoopShimsSecure.InputSplitShim(job, split.getPaths(), split.getStartOffsets(),
                  split.getLengths(), split.getLocations()));
        }
      }

      return (CombineFileSplit[]) inputSplitShims
          .toArray(new HadoopShimsSecure.InputSplitShim[inputSplitShims.size()]);
    }

    public HadoopShimsSecure.InputSplitShim getInputSplitShim() throws IOException {
      return new HadoopShimsSecure.InputSplitShim();
    }

    public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
        Class<RecordReader<K, V>> rrClass) throws IOException {
      return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
    }

    public void setHoodieFilter(boolean hoodieFilter) {
      this.hoodieFilter = hoodieFilter;
    }

    public void setRealTime(boolean realTime) {
      isRealTime = realTime;
    }
  }
}
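
A rough sketch (not part of this patch) of driving the new shim directly, e.g. from a test; the table path and split size below are assumed values, and a valid Hoodie COW dataset must already exist at that location:

    JobConf job = new JobConf();
    // cap each combined split at ~128 MB (illustrative value)
    job.set("mapreduce.input.fileinputformat.split.maxsize", "134217728");
    FileInputFormat.setInputPaths(job, new Path("/tmp/hoodie/trips/2019/01/01"));

    HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim shim =
        new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim();
    shim.setHoodieFilter(true); // list files through HoodieInputFormat so only valid data files are combined
    CombineFileSplit[] splits = shim.getSplits(job, 1);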