1
0

[HUDI-764] [HUDI-765] ORC reader writer Implementation (#2999)

Co-authored-by: Qingyun (Teresa) Kang <kteresa@uber.com>
This commit is contained in:
Jintao Guan
2021-06-15 15:21:43 -07:00
committed by GitHub
parent cb642ceb75
commit b8fe5b91d5
29 changed files with 2268 additions and 91 deletions

View File

@@ -24,7 +24,8 @@ package org.apache.hudi.common.model;
public enum HoodieFileFormat {
PARQUET(".parquet"),
HOODIE_LOG(".log"),
HFILE(".hfile");
HFILE(".hfile"),
ORC(".orc");
private final String extension;

View File

@@ -0,0 +1,799 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.Base64;
import java.util.Date;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.StringType;
import org.apache.avro.util.Utf8;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.ColumnVector;
import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector;
import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector;
import org.apache.orc.storage.ql.exec.vector.ListColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.MapColumnVector;
import org.apache.orc.storage.ql.exec.vector.StructColumnVector;
import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector;
import org.apache.orc.storage.ql.exec.vector.UnionColumnVector;
import org.apache.orc.storage.serde2.io.DateWritable;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.TypeDescription;
/**
* Methods including addToVector, addUnionValue, createOrcSchema are originally from
* https://github.com/streamsets/datacollector.
* Source classes:
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcRecordConverter
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcSchemaConverter
*
* Changes made:
* 1. Flatten nullable Avro schema type when the value is not null in `addToVector`.
* 2. Use getLogicalType(), constants from LogicalTypes instead of getJsonProp() to handle Avro logical types.
*/
public class AvroOrcUtils {
private static final int MICROS_PER_MILLI = 1000;
private static final int NANOS_PER_MICRO = 1000;
/**
* Add an object (of a given ORC type) to the column vector at a given position.
*
* @param type ORC schema of the value Object.
* @param colVector The column vector to store the value Object.
* @param avroSchema Avro schema of the value Object.
* Only used to check logical types for timestamp unit conversion.
* @param value Object to be added to the column vector
* @param vectorPos The position in the vector where value will be stored at.
*/
public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, Object value, int vectorPos) {
  final int currentVecLength = colVector.isNull.length;
  // Grow the vector (doubling) when writing past its current capacity.
  if (vectorPos >= currentVecLength) {
    colVector.ensureSize(2 * currentVecLength, true);
  }
  // Nulls are recorded in the vector's null bitmap rather than stored as values.
  if (value == null) {
    colVector.isNull[vectorPos] = true;
    colVector.noNulls = false;
    return;
  }
  // Now that the value is known to be non-null, collapse a nullable union
  // (e.g. ["null", T]) down to its concrete member schema.
  if (avroSchema.getType().equals(Schema.Type.UNION)) {
    avroSchema = getActualSchemaType(avroSchema);
  }
  // Logical type (date/timestamp/decimal) drives unit conversion below.
  LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
  switch (type.getCategory()) {
    case BOOLEAN:
      LongColumnVector boolVec = (LongColumnVector) colVector;
      boolVec.vector[vectorPos] = (boolean) value ? 1 : 0;
      break;
    case BYTE:
      LongColumnVector byteColVec = (LongColumnVector) colVector;
      byteColVec.vector[vectorPos] = (byte) value;
      break;
    case SHORT:
      LongColumnVector shortColVec = (LongColumnVector) colVector;
      shortColVec.vector[vectorPos] = (short) value;
      break;
    case INT:
      // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but we will ignore that fact here
      // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the int value
      LongColumnVector intColVec = (LongColumnVector) colVector;
      intColVec.vector[vectorPos] = (int) value;
      break;
    case LONG:
      // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but we will ignore that fact here
      // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the long value
      LongColumnVector longColVec = (LongColumnVector) colVector;
      longColVec.vector[vectorPos] = (long) value;
      break;
    case FLOAT:
      DoubleColumnVector floatColVec = (DoubleColumnVector) colVector;
      floatColVec.vector[vectorPos] = (float) value;
      break;
    case DOUBLE:
      DoubleColumnVector doubleColVec = (DoubleColumnVector) colVector;
      doubleColVec.vector[vectorPos] = (double) value;
      break;
    case VARCHAR:
    case CHAR:
    case STRING:
      // Accepts String, Utf8 and enum symbols; everything else is rejected.
      BytesColumnVector bytesColVec = (BytesColumnVector) colVector;
      byte[] bytes = null;
      if (value instanceof String) {
        bytes = ((String) value).getBytes(StandardCharsets.UTF_8);
      } else if (value instanceof Utf8) {
        final Utf8 utf8 = (Utf8) value;
        bytes = utf8.getBytes();
      } else if (value instanceof GenericData.EnumSymbol) {
        bytes = ((GenericData.EnumSymbol) value).toString().getBytes(StandardCharsets.UTF_8);
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro %s field value, which has type %s, value %s",
            type.getCategory().getName(),
            value.getClass().getName(),
            value.toString()
        ));
      }
      if (bytes == null) {
        bytesColVec.isNull[vectorPos] = true;
        bytesColVec.noNulls = false;
      } else {
        // setRef keeps a reference to the byte array instead of copying it.
        bytesColVec.setRef(vectorPos, bytes, 0, bytes.length);
      }
      break;
    case DATE:
      // ORC DATE stores days since the unix epoch; convert per input type.
      LongColumnVector dateColVec = (LongColumnVector) colVector;
      int daysSinceEpoch;
      if (logicalType instanceof LogicalTypes.Date) {
        // Avro date logical type already stores days-since-epoch as an int.
        daysSinceEpoch = (int) value;
      } else if (value instanceof java.sql.Date) {
        daysSinceEpoch = DateWritable.dateToDays((java.sql.Date) value);
      } else if (value instanceof Date) {
        daysSinceEpoch = DateWritable.millisToDays(((Date) value).getTime());
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro DATE field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      dateColVec.vector[vectorPos] = daysSinceEpoch;
      break;
    case TIMESTAMP:
      TimestampColumnVector tsColVec = (TimestampColumnVector) colVector;
      long time;
      int nanos = 0;
      // The unit for Timestamp in ORC is millis, convert timestamp to millis if needed
      if (logicalType instanceof LogicalTypes.TimestampMillis) {
        time = (long) value;
      } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
        // Split micros into (millis, sub-milli nanos) to avoid losing precision.
        final long logicalTsValue = (long) value;
        time = logicalTsValue / MICROS_PER_MILLI;
        nanos = NANOS_PER_MICRO * ((int) (logicalTsValue % MICROS_PER_MILLI));
      } else if (value instanceof Timestamp) {
        Timestamp tsValue = (Timestamp) value;
        time = tsValue.getTime();
        nanos = tsValue.getNanos();
      } else if (value instanceof java.sql.Date) {
        java.sql.Date sqlDateValue = (java.sql.Date) value;
        time = sqlDateValue.getTime();
      } else if (value instanceof Date) {
        Date dateValue = (Date) value;
        time = dateValue.getTime();
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro TIMESTAMP field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      tsColVec.time[vectorPos] = time;
      tsColVec.nanos[vectorPos] = nanos;
      break;
    case BINARY:
      BytesColumnVector binaryColVec = (BytesColumnVector) colVector;
      byte[] binaryBytes;
      if (value instanceof GenericData.Fixed) {
        binaryBytes = ((GenericData.Fixed) value).bytes();
      } else if (value instanceof ByteBuffer) {
        // NOTE: consumes the buffer's remaining bytes (advances its position).
        final ByteBuffer byteBuffer = (ByteBuffer) value;
        binaryBytes = new byte[byteBuffer.remaining()];
        byteBuffer.get(binaryBytes);
      } else if (value instanceof byte[]) {
        binaryBytes = (byte[]) value;
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro BINARY field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      binaryColVec.setRef(vectorPos, binaryBytes, 0, binaryBytes.length);
      break;
    case DECIMAL:
      DecimalColumnVector decimalColVec = (DecimalColumnVector) colVector;
      HiveDecimal decimalValue;
      if (value instanceof BigDecimal) {
        final BigDecimal decimal = (BigDecimal) value;
        decimalValue = HiveDecimal.create(decimal);
      } else if (value instanceof ByteBuffer) {
        // Avro 'bytes' decimal: two's-complement unscaled value; scale comes
        // from the ORC type, not from the bytes themselves.
        final ByteBuffer byteBuffer = (ByteBuffer) value;
        final byte[] decimalBytes = new byte[byteBuffer.remaining()];
        byteBuffer.get(decimalBytes);
        final BigInteger bigInt = new BigInteger(decimalBytes);
        final int scale = type.getScale();
        BigDecimal bigDecVal = new BigDecimal(bigInt, scale);
        decimalValue = HiveDecimal.create(bigDecVal);
        if (decimalValue == null && decimalBytes.length > 0) {
          throw new IllegalStateException(
              "Unexpected read null HiveDecimal from bytes (base-64 encoded): "
                  + Base64.getEncoder().encodeToString(decimalBytes)
          );
        }
      } else if (value instanceof GenericData.Fixed) {
        final BigDecimal decimal = new Conversions.DecimalConversion()
            .fromFixed((GenericData.Fixed) value, avroSchema, logicalType);
        decimalValue = HiveDecimal.create(decimal);
      } else {
        throw new IllegalStateException(String.format(
            "Unexpected type for decimal (%s), cannot convert from Avro value",
            value.getClass().getCanonicalName()
        ));
      }
      // HiveDecimal.create returns null when the value does not fit; record it as null.
      if (decimalValue == null) {
        decimalColVec.isNull[vectorPos] = true;
        decimalColVec.noNulls = false;
      } else {
        decimalColVec.set(vectorPos, decimalValue);
      }
      break;
    case LIST:
      // Elements are appended to the shared child vector; offsets/lengths
      // record this row's slice of it.
      List<?> list = (List<?>) value;
      ListColumnVector listColVec = (ListColumnVector) colVector;
      listColVec.offsets[vectorPos] = listColVec.childCount;
      listColVec.lengths[vectorPos] = list.size();
      TypeDescription listType = type.getChildren().get(0);
      for (Object listItem : list) {
        addToVector(listType, listColVec.child, avroSchema.getElementType(), listItem, listColVec.childCount++);
      }
      break;
    case MAP:
      Map<String, ?> mapValue = (Map<String, ?>) value;
      MapColumnVector mapColumnVector = (MapColumnVector) colVector;
      mapColumnVector.offsets[vectorPos] = mapColumnVector.childCount;
      mapColumnVector.lengths[vectorPos] = mapValue.size();
      // keys are always strings
      Schema keySchema = Schema.create(Schema.Type.STRING);
      for (Map.Entry<String, ?> entry : mapValue.entrySet()) {
        addToVector(
            type.getChildren().get(0),
            mapColumnVector.keys,
            keySchema,
            entry.getKey(),
            mapColumnVector.childCount
        );
        addToVector(
            type.getChildren().get(1),
            mapColumnVector.values,
            avroSchema.getValueType(),
            entry.getValue(),
            mapColumnVector.childCount
        );
        mapColumnVector.childCount++;
      }
      break;
    case STRUCT:
      // NOTE(review): pairs ORC children with Avro fields by position, and
      // assumes the two schemas have the same field order — confirm this
      // holds for evolved/reordered schemas.
      StructColumnVector structColVec = (StructColumnVector) colVector;
      GenericData.Record record = (GenericData.Record) value;
      for (int i = 0; i < type.getFieldNames().size(); i++) {
        String fieldName = type.getFieldNames().get(i);
        Object fieldValue = record.get(fieldName);
        TypeDescription fieldType = type.getChildren().get(i);
        addToVector(fieldType, structColVec.fields[i], avroSchema.getFields().get(i).schema(), fieldValue, vectorPos);
      }
      break;
    case UNION:
      UnionColumnVector unionColVec = (UnionColumnVector) colVector;
      List<TypeDescription> childTypes = type.getChildren();
      boolean added = addUnionValue(unionColVec, childTypes, avroSchema, value, vectorPos);
      if (!added) {
        throw new IllegalStateException(String.format(
            "Failed to add value %s to union with type %s",
            value == null ? "null" : value.toString(),
            type.toString()
        ));
      }
      break;
    default:
      throw new IllegalArgumentException("Invalid TypeDescription " + type.toString() + ".");
  }
}
/**
* Match value with its ORC type and add to the union vector at a given position.
*
* @param unionVector The vector to store value.
* @param unionChildTypes All possible types for the value Object.
* @param avroSchema Avro union schema for the value Object.
* @param value Object to be added to the unionVector
* @param vectorPos The position in the vector where value will be stored at.
* @return succeeded or failed
*/
public static boolean addUnionValue(
    UnionColumnVector unionVector,
    List<TypeDescription> unionChildTypes,
    Schema avroSchema,
    Object value,
    int vectorPos
) {
  // Probe each union member in order; the first member whose ORC category is
  // compatible with the runtime type of `value` wins.
  int matchIndex = -1;
  TypeDescription matchType = null;
  Object matchValue = null;
  for (int t = 0; t < unionChildTypes.size(); t++) {
    TypeDescription childType = unionChildTypes.get(t);
    boolean matches = false;
    switch (childType.getCategory()) {
      case BOOLEAN:
        matches = value instanceof Boolean;
        break;
      case BYTE:
        matches = value instanceof Byte;
        break;
      case SHORT:
        matches = value instanceof Short;
        break;
      case INT:
        matches = value instanceof Integer;
        break;
      case LONG:
        matches = value instanceof Long;
        break;
      case FLOAT:
        matches = value instanceof Float;
        break;
      case DOUBLE:
        matches = value instanceof Double;
        break;
      case STRING:
      case VARCHAR:
      case CHAR:
        if (value instanceof String) {
          matches = true;
          matchValue = ((String) value).getBytes(StandardCharsets.UTF_8);
        } else if (value instanceof Utf8) {
          matches = true;
          matchValue = ((Utf8) value).getBytes();
        }
        break;
      case DATE:
        matches = value instanceof Date;
        break;
      case TIMESTAMP:
        matches = value instanceof Timestamp;
        break;
      case BINARY:
        matches = value instanceof byte[] || value instanceof GenericData.Fixed;
        break;
      case DECIMAL:
        matches = value instanceof BigDecimal;
        break;
      case LIST:
        matches = value instanceof List;
        break;
      case MAP:
        matches = value instanceof Map;
        break;
      case STRUCT:
        throw new UnsupportedOperationException("Cannot handle STRUCT within UNION.");
      case UNION:
        // A nested empty union stands in for the NULL type; a null value only
        // matches that. Non-null values recurse into the nested union.
        List<TypeDescription> children = childType.getChildren();
        if (value == null) {
          matches = children == null || children.size() == 0;
        } else {
          matches = addUnionValue(unionVector, children, avroSchema, value, vectorPos);
        }
        break;
      default:
        throw new IllegalArgumentException("Invalid TypeDescription " + childType.getCategory().toString() + ".");
    }
    if (matches) {
      matchIndex = t;
      matchType = childType;
      break;
    }
  }
  // NOTE(review): matchValue is only ever assigned in branches guarded by
  // `value instanceof ...` (i.e. value != null), so this condition appears
  // unreachable. The original String/Utf8 value is therefore passed through
  // to addToVector unchanged, which handles both representations — confirm
  // against the upstream streamsets converter before "fixing" this.
  if (value == null && matchValue != null) {
    value = matchValue;
  }
  if (matchIndex >= 0) {
    // tags[] records which union member this row uses.
    unionVector.tags[vectorPos] = matchIndex;
    if (value == null) {
      unionVector.isNull[vectorPos] = true;
      unionVector.noNulls = false;
    } else {
      addToVector(matchType, unionVector.fields[matchIndex], avroSchema.getTypes().get(matchIndex), value, vectorPos);
    }
    return true;
  } else {
    return false;
  }
}
/**
* Read the Column vector at a given position conforming to a given ORC schema.
*
* @param type ORC schema of the object to read.
* @param colVector The column vector to read.
* @param avroSchema Avro schema of the object to read.
* Only used to check logical types for timestamp unit conversion.
* @param vectorPos The position in the vector where the value to read is stored at.
* @return The object being read.
*/
public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, int vectorPos) {
  // A repeating vector stores its single value at position 0.
  if (colVector.isRepeating) {
    vectorPos = 0;
  }
  if (colVector.isNull[vectorPos]) {
    return null;
  }
  // Collapse a nullable union (["null", T]) to its concrete member schema.
  if (avroSchema.getType().equals(Schema.Type.UNION)) {
    avroSchema = getActualSchemaType(avroSchema);
  }
  LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
  switch (type.getCategory()) {
    case BOOLEAN:
      return ((LongColumnVector) colVector).vector[vectorPos] != 0;
    case BYTE:
      return (byte) ((LongColumnVector) colVector).vector[vectorPos];
    case SHORT:
      return (short) ((LongColumnVector) colVector).vector[vectorPos];
    case INT:
      return (int) ((LongColumnVector) colVector).vector[vectorPos];
    case LONG:
      return ((LongColumnVector) colVector).vector[vectorPos];
    case FLOAT:
      return (float) ((DoubleColumnVector) colVector).vector[vectorPos];
    case DOUBLE:
      return ((DoubleColumnVector) colVector).vector[vectorPos];
    case VARCHAR:
    case CHAR:
      int maxLength = type.getMaxLength();
      String result = ((BytesColumnVector) colVector).toString(vectorPos);
      if (result.length() <= maxLength) {
        return result;
      } else {
        throw new HoodieIOException("CHAR/VARCHAR has length " + result.length() + " greater than Max Length allowed");
      }
    case STRING:
      // Honor the "avro.java.string" property: return a java.lang.String when the
      // schema asks for it, otherwise Avro's default Utf8 representation.
      // BUG FIX: the original compared the String property value against the enum
      // constant StringType.String; String.equals(non-String) is always false, so
      // the property was silently ignored and a Utf8 was always returned. Compare
      // against the enum's name ("String") instead.
      String stringType = avroSchema.getProp(GenericData.STRING_PROP);
      if (stringType == null || !stringType.equals(StringType.String.name())) {
        int stringLength = ((BytesColumnVector) colVector).length[vectorPos];
        int stringOffset = ((BytesColumnVector) colVector).start[vectorPos];
        byte[] stringBytes = new byte[stringLength];
        System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], stringOffset, stringBytes, 0, stringLength);
        return new Utf8(stringBytes);
      } else {
        return ((BytesColumnVector) colVector).toString(vectorPos);
      }
    case DATE:
      // convert to daysSinceEpoch for LogicalType.Date
      return (int) ((LongColumnVector) colVector).vector[vectorPos];
    case TIMESTAMP:
      // The unit of time in ORC is millis. Convert (time,nanos) to the desired unit per logicalType
      long time = ((TimestampColumnVector) colVector).time[vectorPos];
      int nanos = ((TimestampColumnVector) colVector).nanos[vectorPos];
      if (logicalType instanceof LogicalTypes.TimestampMillis) {
        return time;
      } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
        return time * MICROS_PER_MILLI + nanos / NANOS_PER_MICRO;
      } else {
        // No timestamp logical type: fall back to ORC's own long representation.
        return ((TimestampColumnVector) colVector).getTimestampAsLong(vectorPos);
      }
    case BINARY:
      int binaryLength = ((BytesColumnVector) colVector).length[vectorPos];
      int binaryOffset = ((BytesColumnVector) colVector).start[vectorPos];
      byte[] binaryBytes = new byte[binaryLength];
      System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], binaryOffset, binaryBytes, 0, binaryLength);
      // return a ByteBuffer to be consistent with AvroRecordConverter
      return ByteBuffer.wrap(binaryBytes);
    case DECIMAL:
      // HiveDecimal always ignores trailing zeros, thus modifies the scale implicitly,
      // therefore, the scale must be enforced here.
      // NOTE(review): this cast assumes the Avro schema carries a decimal logical
      // type whenever the ORC type is DECIMAL; a plain BYTES/FIXED schema would
      // throw ClassCastException here — confirm upstream schemas guarantee this.
      BigDecimal bigDecimal = ((DecimalColumnVector) colVector).vector[vectorPos]
          .getHiveDecimal().bigDecimalValue()
          .setScale(((LogicalTypes.Decimal) logicalType).getScale());
      Schema.Type baseType = avroSchema.getType();
      if (baseType.equals(Schema.Type.FIXED)) {
        return new Conversions.DecimalConversion().toFixed(bigDecimal, avroSchema, logicalType);
      } else if (baseType.equals(Schema.Type.BYTES)) {
        return bigDecimal.unscaledValue().toByteArray();
      } else {
        throw new HoodieIOException(baseType.getName() + "is not a valid type for LogicalTypes.DECIMAL.");
      }
    case LIST:
      // Read this row's slice [listOffset, listOffset + listLength) of the child vector.
      ArrayList<Object> list = new ArrayList<>();
      ListColumnVector listVector = (ListColumnVector) colVector;
      int listLength = (int) listVector.lengths[vectorPos];
      int listOffset = (int) listVector.offsets[vectorPos];
      list.ensureCapacity(listLength);
      TypeDescription childType = type.getChildren().get(0);
      for (int i = 0; i < listLength; i++) {
        list.add(readFromVector(childType, listVector.child, avroSchema.getElementType(), listOffset + i));
      }
      return list;
    case MAP:
      Map<String, Object> map = new HashMap<String, Object>();
      MapColumnVector mapVector = (MapColumnVector) colVector;
      int mapLength = (int) mapVector.lengths[vectorPos];
      int mapOffset = (int) mapVector.offsets[vectorPos];
      // keys are always strings for maps in Avro
      Schema keySchema = Schema.create(Schema.Type.STRING);
      for (int i = 0; i < mapLength; i++) {
        map.put(
            readFromVector(type.getChildren().get(0), mapVector.keys, keySchema, i + mapOffset).toString(),
            readFromVector(type.getChildren().get(1), mapVector.values,
                avroSchema.getValueType(), i + mapOffset));
      }
      return map;
    case STRUCT:
      // NOTE(review): pairs ORC children with Avro fields by position — assumes
      // the two schemas have identical field order.
      StructColumnVector structVector = (StructColumnVector) colVector;
      List<TypeDescription> children = type.getChildren();
      GenericData.Record record = new GenericData.Record(avroSchema);
      for (int i = 0; i < children.size(); i++) {
        record.put(i, readFromVector(children.get(i), structVector.fields[i],
            avroSchema.getFields().get(i).schema(), vectorPos));
      }
      return record;
    case UNION:
      // tags[] says which union member this row stored; read only that field vector.
      UnionColumnVector unionVector = (UnionColumnVector) colVector;
      int tag = unionVector.tags[vectorPos];
      ColumnVector fieldVector = unionVector.fields[tag];
      return readFromVector(type.getChildren().get(tag), fieldVector, avroSchema.getTypes().get(tag), vectorPos);
    default:
      throw new HoodieIOException("Unrecognized TypeDescription " + type.toString());
  }
}
/**
 * Derives the ORC schema equivalent of an Avro schema.
 *
 * <p>Avro logical types (decimal, date, time, timestamp) take precedence over the
 * underlying primitive type; otherwise the mapping follows the Avro base type.
 *
 * @param avroSchema the Avro schema to convert.
 * @return the corresponding ORC {@link TypeDescription}.
 */
public static TypeDescription createOrcSchema(Schema avroSchema) {
  TypeDescription logicalMapping = orcTypeForLogicalType(avroSchema.getLogicalType());
  if (logicalMapping != null) {
    return logicalMapping;
  }
  final Schema.Type type = avroSchema.getType();
  switch (type) {
    case NULL:
      // An empty ORC union stands in for the Avro null type.
      return TypeDescription.createUnion();
    case LONG:
      return TypeDescription.createLong();
    case INT:
      return TypeDescription.createInt();
    case BYTES:
      return TypeDescription.createBinary();
    case ARRAY:
      return TypeDescription.createList(createOrcSchema(avroSchema.getElementType()));
    case RECORD:
      final TypeDescription struct = TypeDescription.createStruct();
      for (Schema.Field avroField : avroSchema.getFields()) {
        final TypeDescription memberType = createOrcSchema(avroField.schema());
        if (memberType != null) {
          struct.addField(avroField.name(), memberType);
        }
      }
      return struct;
    case MAP:
      // In Avro maps, keys are always strings.
      return TypeDescription.createMap(
          TypeDescription.createString(),
          createOrcSchema(avroSchema.getValueType())
      );
    case UNION:
      final List<Schema> nonNullMembers = avroSchema.getTypes().stream()
          .filter(member -> !Schema.Type.NULL.equals(member.getType()))
          .collect(Collectors.toList());
      if (nonNullMembers.isEmpty()) {
        // No non-null members: same representation as the NULL type.
        return TypeDescription.createUnion();
      }
      if (nonNullMembers.size() == 1) {
        // ["null", T] is Avro's nullable T; ORC types are nullable already,
        // so the single member's type is used directly.
        return createOrcSchema(nonNullMembers.get(0));
      }
      // A genuine multi-type union maps to an ORC union of the non-null members.
      final TypeDescription orcUnion = TypeDescription.createUnion();
      for (final Schema member : nonNullMembers) {
        orcUnion.addUnionChild(createOrcSchema(member));
      }
      return orcUnion;
    case STRING:
    case ENUM: // enums are represented as strings for now
      return TypeDescription.createString();
    case FLOAT:
      return TypeDescription.createFloat();
    case DOUBLE:
      return TypeDescription.createDouble();
    case BOOLEAN:
      return TypeDescription.createBoolean();
    case FIXED:
      return TypeDescription.createBinary();
    default:
      throw new IllegalStateException(String.format("Unrecognized Avro type: %s", type.getName()));
  }
}

/**
 * Maps an Avro logical type to its ORC representation.
 *
 * @param logicalType the Avro logical type, possibly null.
 * @return the ORC type for a recognized logical type, or null when none applies
 *         (caller then falls back to the base-type mapping).
 */
private static TypeDescription orcTypeForLogicalType(LogicalType logicalType) {
  if (logicalType == null) {
    return null;
  }
  if (logicalType instanceof LogicalTypes.Decimal) {
    LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
    return TypeDescription.createDecimal()
        .withPrecision(decimal.getPrecision())
        .withScale(decimal.getScale());
  }
  if (logicalType instanceof LogicalTypes.Date) {
    // date: days since the unix epoch, annotating an Avro int.
    return TypeDescription.createDate();
  }
  if (logicalType instanceof LogicalTypes.TimeMillis) {
    // time-millis: millis after midnight, annotating an Avro int;
    // ORC has no time-of-day type, so the int is preserved as-is.
    return TypeDescription.createInt();
  }
  if (logicalType instanceof LogicalTypes.TimeMicros) {
    // time-micros: micros after midnight, annotating an Avro long.
    return TypeDescription.createLong();
  }
  if (logicalType instanceof LogicalTypes.TimestampMillis
      || logicalType instanceof LogicalTypes.TimestampMicros) {
    // timestamp-millis / timestamp-micros: an instant since the unix epoch.
    return TypeDescription.createTimestamp();
  }
  // Unrecognized logical type: defer to the base-type mapping.
  return null;
}
/**
 * Derives the Avro schema equivalent of an ORC schema.
 *
 * <p>Lossy in places by necessity: ORC has no way to distinguish time-millis from
 * time-micros ints/longs, and its timestamps are assumed to be timestamp-millis.
 *
 * @param orcSchema the ORC schema to convert.
 * @return the corresponding Avro {@link Schema}.
 */
public static Schema createAvroSchema(TypeDescription orcSchema) {
  switch (orcSchema.getCategory()) {
    case BOOLEAN:
      return Schema.create(Schema.Type.BOOLEAN);
    case BYTE:   // tinyint (8 bit), widened to int
    case SHORT:  // smallint (16 bit), widened to int
    case INT:
      // Could originally have carried a time-millis logical type, but there is
      // no way to tell from the ORC side.
      return Schema.create(Schema.Type.INT);
    case LONG:
      // Could originally have carried a time-micros logical type; indistinguishable here.
      return Schema.create(Schema.Type.LONG);
    case FLOAT:
      return Schema.create(Schema.Type.FLOAT);
    case DOUBLE:
      return Schema.create(Schema.Type.DOUBLE);
    case VARCHAR:
    case CHAR:
    case STRING:
      return Schema.create(Schema.Type.STRING);
    case DATE:
      // Annotate an int with the date logical type (days since epoch).
      return LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT));
    case TIMESTAMP:
      // Cannot distinguish TIMESTAMP_MILLIS from TIMESTAMP_MICROS; assume millis
      // because ORC timestamps are stored in millis.
      return LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG));
    case BINARY:
      return Schema.create(Schema.Type.BYTES);
    case DECIMAL:
      // Carry precision/scale over via the decimal logical type on bytes.
      return LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale())
          .addToSchema(Schema.create(Schema.Type.BYTES));
    case LIST:
      return Schema.createArray(createAvroSchema(orcSchema.getChildren().get(0)));
    case MAP:
      // Child 0 is the key type (always string in Avro); only the value type matters.
      return Schema.createMap(createAvroSchema(orcSchema.getChildren().get(1)));
    case STRUCT:
      List<TypeDescription> memberTypes = orcSchema.getChildren();
      List<String> memberNames = orcSchema.getFieldNames();
      List<Field> recordFields = new ArrayList<>(memberTypes.size());
      for (int idx = 0; idx < memberTypes.size(); idx++) {
        recordFields.add(new Field(memberNames.get(idx), createAvroSchema(memberTypes.get(idx)), "", null));
      }
      return Schema.createRecord(recordFields);
    case UNION:
      return Schema.createUnion(orcSchema.getChildren().stream()
          .map(AvroOrcUtils::createAvroSchema)
          .collect(Collectors.toList()));
    default:
      throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName()));
  }
}
/**
 * Collapses an Avro union down to its non-NULL content.
 *
 * <p>All ORC types are nullable by construction, while Avro expresses nullability
 * as a union containing the NULL type. Stripping the NULL member keeps the Avro
 * and ORC schemas consistent with each other.
 *
 * @param unionSchema a schema of union type.
 * @return the NULL schema when the union holds nothing else, the single remaining
 *         member when exactly one is left, or a union of all non-NULL members.
 */
private static Schema getActualSchemaType(Schema unionSchema) {
  final List<Schema> concreteMembers = new ArrayList<>();
  for (Schema member : unionSchema.getTypes()) {
    if (!Schema.Type.NULL.equals(member.getType())) {
      concreteMembers.add(member);
    }
  }
  switch (concreteMembers.size()) {
    case 0:
      return Schema.create(Schema.Type.NULL);
    case 1:
      return concreteMembers.get(0);
    default:
      return Schema.createUnion(concreteMembers);
  }
}
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.common.util;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -25,16 +26,22 @@ import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.exception.HoodieException;
public abstract class BaseFileUtils {
/**
 * Resolves the reader utilities for a base file by its extension.
 *
 * @param path path of the base file (must end in a known extension).
 * @return the matching {@link BaseFileUtils} implementation.
 * @throws UnsupportedOperationException when the extension is not recognized.
 */
public static BaseFileUtils getInstance(String path) {
  if (path.endsWith(HoodieFileFormat.ORC.getFileExtension())) {
    return new OrcUtils();
  }
  if (path.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
    return new ParquetUtils();
  }
  throw new UnsupportedOperationException("The format for file " + path + " is not supported yet.");
}
@@ -42,6 +49,8 @@ public abstract class BaseFileUtils {
/**
 * Resolves the reader utilities for a given base file format.
 *
 * @param fileFormat the table's base file format.
 * @return the matching {@link BaseFileUtils} implementation.
 * @throws UnsupportedOperationException for formats without a reader implementation.
 */
public static BaseFileUtils getInstance(HoodieFileFormat fileFormat) {
  switch (fileFormat) {
    case PARQUET:
      return new ParquetUtils();
    case ORC:
      return new OrcUtils();
    default:
      throw new UnsupportedOperationException(fileFormat.name() + " format not supported yet.");
  }
}
@@ -50,24 +59,122 @@ public abstract class BaseFileUtils {
return getInstance(metaClient.getTableConfig().getBaseFileFormat());
}
public abstract Set<String> readRowKeys(Configuration configuration, Path filePath);
/**
 * Read the rowKey list from the given data file.
 *
 * @param filePath The data file path
 * @param configuration configuration to build fs object
 * @return Set Set of row keys
 */
public Set<String> readRowKeys(Configuration configuration, Path filePath) {
  // Delegates with an empty candidate set — presumably filterRowKeys treats an
  // empty filter as "no filtering" and returns every row key; confirm against
  // the concrete implementations (ParquetUtils/OrcUtils).
  return filterRowKeys(configuration, filePath, new HashSet<>());
}
public abstract Set<String> filterRowKeys(Configuration configuration, Path filePath, Set<String> filter);
/**
 * Read the bloom filter from the metadata of the given data file.
 *
 * @param configuration Configuration
 * @param filePath The data file path
 * @return the deserialized {@link BloomFilter}, or null when no bloom filter
 *         is present in the file footer.
 */
public BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path filePath) {
  Map<String, String> footers = readFooter(configuration, false, filePath,
      HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
      HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
      HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
  String serializedFilter = footers.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
  if (serializedFilter == null) {
    // Fall back to the legacy key "com.uber.hoodie.bloomfilter".
    serializedFilter = footers.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
  }
  if (serializedFilter == null) {
    // No bloom filter stored in this file.
    return null;
  }
  // Older files carry no type code; default to the SIMPLE filter implementation.
  String typeCode = footers.containsKey(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)
      ? footers.get(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)
      : BloomFilterTypeCode.SIMPLE.name();
  return BloomFilterFactory.fromString(serializedFilter, typeCode);
}
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
public abstract Schema readAvroSchema(Configuration configuration, Path filePath);
public abstract BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path filePath);
public abstract String[] readMinMaxRecordKeys(Configuration configuration, Path filePath);
/**
* Read the min and max record key from the metadata of the given data file.
* @param configuration Configuration
* @param filePath The data file path
* @return A array of two string where the first is min record key and the second is max record key
*/
public String[] readMinMaxRecordKeys(Configuration configuration, Path filePath) {
Map<String, String> minMaxKeys = readFooter(configuration, true, filePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
throw new HoodieException(
String.format("Could not read min/max record key out of footer correctly from %s. read) : %s",
filePath, minMaxKeys));
}
return new String[] {minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
}
/**
* Read the data file
* NOTE: This literally reads the entire file contents, thus should be used with caution.
* @param configuration Configuration
* @param filePath The data file path
* @return A list of GenericRecord
*/
public abstract List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath);
/**
* Read the data file using the given schema
* NOTE: This literally reads the entire file contents, thus should be used with caution.
* @param configuration Configuration
* @param filePath The data file path
* @return A list of GenericRecord
*/
public abstract List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema schema);
public abstract Map<String, String> readFooter(Configuration conf, boolean required, Path orcFilePath,
String... footerNames);
/**
* Read the footer data of the given data file.
* @param configuration Configuration
* @param required require the footer data to be in data file
* @param filePath The data file path
* @param footerNames The footer names to read
* @return A map where the key is the footer name and the value is the footer value
*/
public abstract Map<String, String> readFooter(Configuration configuration, boolean required, Path filePath,
String... footerNames);
public abstract long getRowCount(Configuration conf, Path filePath);
}
/**
* Returns the number of records in the data file.
* @param configuration Configuration
* @param filePath The data file path
*/
public abstract long getRowCount(Configuration configuration, Path filePath);
/**
* Read the rowKey list matching the given filter, from the given data file.
* If the filter is empty, then this will return all the row keys.
* @param filePath The data file path
* @param configuration configuration to build fs object
* @param filter record keys filter
* @return Set Set of row keys matching candidateRecordKeys
*/
public abstract Set<String> filterRowKeys(Configuration configuration, Path filePath, Set<String> filter);
/**
* Fetch {@link HoodieKey}s from the given data file.
* @param configuration configuration to build fs object
* @param filePath The data file path
* @return {@link List} of {@link HoodieKey}s fetched from the parquet file
*/
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
/**
* Read the Avro schema of the data file.
* @param configuration Configuration
* @param filePath The data file path
* @return The Avro schema of the data file
*/
public abstract Schema readAvroSchema(Configuration configuration, Path filePath);
}

View File

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import java.io.IOException;
import java.util.Iterator;
/**
* This class wraps a ORC reader and provides an iterator based api to read from an ORC file.
*/
/**
 * Wraps an ORC {@link RecordReader} and exposes its contents through the
 * {@link Iterator} API, converting each ORC row into an Avro
 * {@link GenericData.Record} built against the supplied Avro schema.
 *
 * <p>NOTE: not thread-safe — the underlying {@link RecordReader} and the
 * current {@link VectorizedRowBatch} are mutable shared state. The caller
 * owns the {@link RecordReader} and is responsible for closing it.
 */
public class OrcReaderIterator<T> implements Iterator<T> {

  private final RecordReader recordReader;
  private final Schema avroSchema;
  // Per-field metadata, aligned by index: ORC field names, ORC field types,
  // and the corresponding Avro field schemas.
  private final List<String> fieldNames;
  private final List<TypeDescription> orcFieldTypes;
  private final Schema[] avroFieldSchemas;
  private final VectorizedRowBatch batch;
  // Index of the next unconsumed row within the current batch.
  private int rowInBatch;
  // Pre-fetched record so hasNext() can answer definitively without losing data.
  private T next;

  /**
   * @param recordReader the ORC row reader to consume (closed by the caller, not here)
   * @param schema       Avro schema used to build each output record
   * @param orcSchema    ORC schema of the file; its fields must all exist in {@code schema}
   */
  public OrcReaderIterator(RecordReader recordReader, Schema schema, TypeDescription orcSchema) {
    this.recordReader = recordReader;
    this.avroSchema = schema;
    this.fieldNames = orcSchema.getFieldNames();
    this.orcFieldTypes = orcSchema.getChildren();
    // NOTE(review): assumes every ORC field name resolves in the Avro schema;
    // avroSchema.getField(...) would return null (then NPE) otherwise — confirm callers
    // always pass matching schemas.
    this.avroFieldSchemas = this.fieldNames.stream()
        .map(fieldName -> avroSchema.getField(fieldName).schema())
        .toArray(Schema[]::new);
    this.batch = orcSchema.createRowBatch();
    this.rowInBatch = 0;
  }

  /**
   * If the current batch is exhausted, pull the next one from the reader.
   *
   * @return true if at least one more row is available.
   * @throws IOException on read failure
   */
  private boolean ensureBatch() throws IOException {
    if (rowInBatch >= batch.size) {
      rowInBatch = 0;
      return recordReader.nextBatch(batch);
    }
    return true;
  }

  @Override
  public boolean hasNext() {
    try {
      // Pre-fetch one record; readRecordFromBatch() performs ensureBatch() itself.
      if (this.next == null) {
        this.next = readRecordFromBatch();
      }
      return this.next != null;
    } catch (IOException io) {
      throw new HoodieIOException("unable to read next record from ORC file ", io);
    }
  }

  @Override
  public T next() {
    try {
      // Handle the case where next() is called before hasNext().
      if (this.next == null && !hasNext()) {
        throw new HoodieIOException("No more records left to read from ORC file");
      }
      T retVal = this.next;
      this.next = readRecordFromBatch();
      return retVal;
    } catch (IOException io) {
      throw new HoodieIOException("unable to read next record from ORC file ", io);
    }
  }

  /**
   * Converts the current row of the batch into an Avro record, advancing the
   * row cursor; returns null once the file is exhausted.
   */
  @SuppressWarnings("unchecked")
  private T readRecordFromBatch() throws IOException {
    // No more records left to read from the ORC file.
    if (!ensureBatch()) {
      return null;
    }
    GenericData.Record record = new Record(avroSchema);
    int numFields = orcFieldTypes.size();
    for (int i = 0; i < numFields; i++) {
      Object data = AvroOrcUtils.readFromVector(orcFieldTypes.get(i), batch.cols[i], avroFieldSchemas[i], rowInBatch);
      record.put(fieldNames.get(i), data);
    }
    rowInBatch++;
    return (T) record;
  }
}

View File

@@ -0,0 +1,235 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto.UserMetadataItem;
import org.apache.orc.Reader;
import org.apache.orc.Reader.Options;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
/**
* Utility functions for ORC files.
*/
public class OrcUtils extends BaseFileUtils {
/**
* Fetch {@link HoodieKey}s from the given ORC file.
*
* @param filePath The ORC file path.
* @param configuration configuration to build fs object
* @return {@link List} of {@link HoodieKey}s fetched from the ORC file
*/
@Override
public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) {
List<HoodieKey> hoodieKeys = new ArrayList<>();
try {
if (!filePath.getFileSystem(configuration).exists(filePath)) {
return new ArrayList<>();
}
Configuration conf = new Configuration(configuration);
conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
List<String> fieldNames = orcSchema.getFieldNames();
VectorizedRowBatch batch = orcSchema.createRowBatch();
RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
// column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields
int keyCol = -1;
int partitionCol = -1;
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
keyCol = i;
}
if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
partitionCol = i;
}
}
if (keyCol == -1 || partitionCol == -1) {
throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
}
while (recordReader.nextBatch(batch)) {
BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[keyCol];
BytesColumnVector partitionPaths = (BytesColumnVector) batch.cols[partitionCol];
for (int i = 0; i < batch.size; i++) {
String rowKey = rowKeys.toString(i);
String partitionPath = partitionPaths.toString(i);
hoodieKeys.add(new HoodieKey(rowKey, partitionPath));
}
}
} catch (IOException e) {
throw new HoodieIOException("Failed to read from ORC file:" + filePath, e);
}
return hoodieKeys;
}
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath) {
Schema avroSchema;
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration));
avroSchema = AvroOrcUtils.createAvroSchema(reader.getSchema());
} catch (IOException io) {
throw new HoodieIOException("Unable to read Avro records from an ORC file:" + filePath, io);
}
return readAvroRecords(configuration, filePath, avroSchema);
}
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
List<GenericRecord> records = new ArrayList<>();
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration));
TypeDescription orcSchema = reader.getSchema();
RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema));
OrcReaderIterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
while (iterator.hasNext()) {
GenericRecord record = iterator.next();
records.add(record);
}
} catch (IOException io) {
throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io);
}
return records;
}
/**
* Read the rowKey list matching the given filter, from the given ORC file. If the filter is empty, then this will
* return all the rowkeys.
*
* @param conf configuration to build fs object.
* @param filePath The ORC file path.
* @param filter record keys filter
* @return Set Set of row keys matching candidateRecordKeys
*/
@Override
public Set<String> filterRowKeys(Configuration conf, Path filePath, Set<String> filter)
throws HoodieIOException {
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
Set<String> filteredRowKeys = new HashSet<>();
TypeDescription schema = reader.getSchema();
List<String> fieldNames = schema.getFieldNames();
VectorizedRowBatch batch = schema.createRowBatch();
RecordReader recordReader = reader.rows(new Options(conf).schema(schema));
// column index for the RECORD_KEY_METADATA_FIELD field
int colIndex = -1;
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
colIndex = i;
break;
}
}
if (colIndex == -1) {
throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath));
}
while (recordReader.nextBatch(batch)) {
BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex];
for (int i = 0; i < batch.size; i++) {
String rowKey = rowKeys.toString(i);
if (filter.isEmpty() || filter.contains(rowKey)) {
filteredRowKeys.add(rowKey);
}
}
}
return filteredRowKeys;
} catch (IOException io) {
throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io);
}
}
@Override
public Map<String, String> readFooter(Configuration conf, boolean required,
Path orcFilePath, String... footerNames) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
Map<String, String> footerVals = new HashMap<>();
List<UserMetadataItem> metadataItemList = reader.getFileTail().getFooter().getMetadataList();
Map<String, String> metadata = metadataItemList.stream().collect(Collectors.toMap(
UserMetadataItem::getName,
metadataItem -> metadataItem.getValue().toStringUtf8()));
for (String footerName : footerNames) {
if (metadata.containsKey(footerName)) {
footerVals.put(footerName, metadata.get(footerName));
} else if (required) {
throw new MetadataNotFoundException(
"Could not find index in ORC footer. Looked for key " + footerName + " in "
+ orcFilePath);
}
}
return footerVals;
} catch (IOException io) {
throw new HoodieIOException("Unable to read footer for ORC file:" + orcFilePath, io);
}
}
@Override
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
TypeDescription orcSchema = reader.getSchema();
return AvroOrcUtils.createAvroSchema(orcSchema);
} catch (IOException io) {
throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
}
}
@Override
public long getRowCount(Configuration conf, Path orcFilePath) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
return reader.getNumberOfRows();
} catch (IOException io) {
throw new HoodieIOException("Unable to get row count for ORC file:" + orcFilePath, io);
}
}
}

View File

@@ -19,14 +19,9 @@
package org.apache.hudi.common.util;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
@@ -57,18 +52,6 @@ import java.util.function.Function;
*/
public class ParquetUtils extends BaseFileUtils {
/**
* Read the rowKey list from the given parquet file.
*
* @param filePath The parquet file path.
* @param configuration configuration to build fs object
* @return Set Set of row keys
*/
@Override
public Set<String> readRowKeys(Configuration configuration, Path filePath) {
return filterRowKeys(configuration, filePath, new HashSet<>());
}
/**
* Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
* return all the rowkeys.
@@ -196,47 +179,8 @@ public class ParquetUtils extends BaseFileUtils {
@Override
public Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
return new AvroSchemaConverter(configuration).convert(readSchema(configuration, parquetFilePath));
}
/**
* Read out the bloom filter from the parquet file meta data.
*/
@Override
public BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path parquetFilePath) {
Map<String, String> footerVals =
readFooter(configuration, false, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
String footerVal = footerVals.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
if (null == footerVal) {
// We use old style key "com.uber.hoodie.bloomfilter"
footerVal = footerVals.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
}
BloomFilter toReturn = null;
if (footerVal != null) {
if (footerVals.containsKey(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) {
toReturn = BloomFilterFactory.fromString(footerVal,
footerVals.get(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE));
} else {
toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name());
}
}
return toReturn;
}
@Override
public String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
Map<String, String> minMaxKeys = readFooter(configuration, true, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
throw new HoodieException(
String.format("Could not read min/max record key out of footer correctly from %s. read) : %s",
parquetFilePath, minMaxKeys));
}
return new String[] {minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
MessageType parquetSchema = readSchema(configuration, parquetFilePath);
return new AvroSchemaConverter(configuration).convert(parquetSchema);
}
/**

View File

@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import java.io.IOException;
import static org.apache.hudi.common.model.HoodieFileFormat.ORC;
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;
@@ -40,6 +41,9 @@ public class HoodieFileReaderFactory {
if (HFILE.getFileExtension().equals(extension)) {
return newHFileFileReader(conf, path);
}
if (ORC.getFileExtension().equals(extension)) {
return newOrcFileReader(conf, path);
}
throw new UnsupportedOperationException(extension + " format not supported yet.");
}
@@ -52,4 +56,8 @@ public class HoodieFileReaderFactory {
CacheConfig cacheConfig = new CacheConfig(conf);
return new HoodieHFileReader<>(conf, path, cacheConfig);
}
private static <R extends IndexedRecord> HoodieFileReader<R> newOrcFileReader(Configuration conf, Path path) {
return new HoodieOrcReader<>(conf, path);
}
}

View File

@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io.storage;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.OrcReaderIterator;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.Reader.Options;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
/**
 * {@link HoodieFileReader} implementation for ORC base files. Metadata lookups
 * (min/max keys, bloom filter, schema, row count) are delegated to
 * {@link BaseFileUtils} for the ORC format; record iteration opens an ORC
 * reader directly.
 */
public class HoodieOrcReader<R extends IndexedRecord> implements HoodieFileReader {

  private final Path path;
  private final Configuration conf;
  private final BaseFileUtils orcUtils;

  /**
   * @param configuration Hadoop configuration used for all file access
   * @param path          path of the ORC base file
   */
  public HoodieOrcReader(Configuration configuration, Path path) {
    this.conf = configuration;
    this.path = path;
    this.orcUtils = BaseFileUtils.getInstance(HoodieFileFormat.ORC);
  }

  @Override
  public String[] readMinMaxRecordKeys() {
    return orcUtils.readMinMaxRecordKeys(conf, path);
  }

  @Override
  public BloomFilter readBloomFilter() {
    return orcUtils.readBloomFilterFromMetadata(conf, path);
  }

  @Override
  public Set<String> filterRowKeys(Set candidateRowKeys) {
    return orcUtils.filterRowKeys(conf, path, candidateRowKeys);
  }

  /**
   * Returns an iterator over the file's records, projected through the given
   * Avro schema (converted to an ORC read schema).
   *
   * @throws IOException declared by the interface; read failures surface as
   *                     {@link HoodieIOException}
   */
  @Override
  public Iterator<R> getRecordIterator(Schema schema) throws IOException {
    try {
      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
      TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema);
      RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
      // Diamond instead of the raw type to keep the iterator's element type as R.
      // NOTE(review): reader/recordReader stay open for the iterator's lifetime and
      // close() below does not release them — confirm callers drain the iterator or
      // tolerate the handle living until GC.
      return new OrcReaderIterator<>(recordReader, schema, orcSchema);
    } catch (IOException io) {
      throw new HoodieIOException("Unable to create an ORC reader.", io);
    }
  }

  @Override
  public Schema getSchema() {
    return orcUtils.readAvroSchema(conf, path);
  }

  @Override
  public void close() {
    // No long-lived resources held directly by this object; see note in
    // getRecordIterator() about the lifetime of per-iterator readers.
  }

  @Override
  public long getTotalRecords() {
    return orcUtils.getRowCount(conf, path);
  }
}