1
0

[HUDI-764] [HUDI-765] ORC reader writer Implementation (#2999)

Co-authored-by: Qingyun (Teresa) Kang <kteresa@uber.com>
This commit is contained in:
Jintao Guan
2021-06-15 15:21:43 -07:00
committed by GitHub
parent cb642ceb75
commit b8fe5b91d5
29 changed files with 2268 additions and 91 deletions

View File

@@ -24,7 +24,8 @@ package org.apache.hudi.common.model;
public enum HoodieFileFormat {
PARQUET(".parquet"),
HOODIE_LOG(".log"),
HFILE(".hfile");
HFILE(".hfile"),
ORC(".orc");
private final String extension;

View File

@@ -0,0 +1,799 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.Base64;
import java.util.Date;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.StringType;
import org.apache.avro.util.Utf8;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.ColumnVector;
import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector;
import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector;
import org.apache.orc.storage.ql.exec.vector.ListColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.MapColumnVector;
import org.apache.orc.storage.ql.exec.vector.StructColumnVector;
import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector;
import org.apache.orc.storage.ql.exec.vector.UnionColumnVector;
import org.apache.orc.storage.serde2.io.DateWritable;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.TypeDescription;
/**
* Methods including addToVector, addUnionValue, createOrcSchema are originally from
* https://github.com/streamsets/datacollector.
* Source classes:
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcRecordConverter
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcSchemaConverter
*
* Changes made:
* 1. Flatten nullable Avro schema type when the value is not null in `addToVector`.
* 2. Use getLogicalType(), constants from LogicalTypes instead of getJsonProp() to handle Avro logical types.
*/
public class AvroOrcUtils {
private static final int MICROS_PER_MILLI = 1000;
private static final int NANOS_PER_MICRO = 1000;
/**
* Add an object (of a given ORC type) to the column vector at a given position.
*
* @param type ORC schema of the value Object.
* @param colVector The column vector to store the value Object.
* @param avroSchema Avro schema of the value Object.
* Only used to check logical types for timestamp unit conversion.
* @param value Object to be added to the column vector
* @param vectorPos The position in the vector where value will be stored at.
*/
public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, Object value, int vectorPos) {
  final int currentVecLength = colVector.isNull.length;
  // Grow the vector (doubling) when writing past its current capacity.
  if (vectorPos >= currentVecLength) {
    colVector.ensureSize(2 * currentVecLength, true);
  }
  // Nulls are recorded in the vector's null bitmap rather than stored as values.
  if (value == null) {
    colVector.isNull[vectorPos] = true;
    colVector.noNulls = false;
    return;
  }
  // Now that the value is known to be non-null, collapse a nullable union
  // (e.g. ["null", T]) down to its concrete member schema.
  if (avroSchema.getType().equals(Schema.Type.UNION)) {
    avroSchema = getActualSchemaType(avroSchema);
  }
  // Logical type (date/timestamp/decimal) drives unit conversion below.
  LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
  switch (type.getCategory()) {
    case BOOLEAN:
      LongColumnVector boolVec = (LongColumnVector) colVector;
      boolVec.vector[vectorPos] = (boolean) value ? 1 : 0;
      break;
    case BYTE:
      LongColumnVector byteColVec = (LongColumnVector) colVector;
      byteColVec.vector[vectorPos] = (byte) value;
      break;
    case SHORT:
      LongColumnVector shortColVec = (LongColumnVector) colVector;
      shortColVec.vector[vectorPos] = (short) value;
      break;
    case INT:
      // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but we will ignore that fact here
      // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the int value
      LongColumnVector intColVec = (LongColumnVector) colVector;
      intColVec.vector[vectorPos] = (int) value;
      break;
    case LONG:
      // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but we will ignore that fact here
      // since Orc has no way to represent a time in the way Avro defines it; we will simply preserve the long value
      LongColumnVector longColVec = (LongColumnVector) colVector;
      longColVec.vector[vectorPos] = (long) value;
      break;
    case FLOAT:
      DoubleColumnVector floatColVec = (DoubleColumnVector) colVector;
      floatColVec.vector[vectorPos] = (float) value;
      break;
    case DOUBLE:
      DoubleColumnVector doubleColVec = (DoubleColumnVector) colVector;
      doubleColVec.vector[vectorPos] = (double) value;
      break;
    case VARCHAR:
    case CHAR:
    case STRING:
      // Accepts String, Utf8 and enum symbols; everything else is rejected.
      BytesColumnVector bytesColVec = (BytesColumnVector) colVector;
      byte[] bytes = null;
      if (value instanceof String) {
        bytes = ((String) value).getBytes(StandardCharsets.UTF_8);
      } else if (value instanceof Utf8) {
        final Utf8 utf8 = (Utf8) value;
        bytes = utf8.getBytes();
      } else if (value instanceof GenericData.EnumSymbol) {
        bytes = ((GenericData.EnumSymbol) value).toString().getBytes(StandardCharsets.UTF_8);
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro %s field value, which has type %s, value %s",
            type.getCategory().getName(),
            value.getClass().getName(),
            value.toString()
        ));
      }
      if (bytes == null) {
        bytesColVec.isNull[vectorPos] = true;
        bytesColVec.noNulls = false;
      } else {
        // setRef keeps a reference to the byte array instead of copying it.
        bytesColVec.setRef(vectorPos, bytes, 0, bytes.length);
      }
      break;
    case DATE:
      // ORC DATE stores days since the unix epoch; convert per input type.
      LongColumnVector dateColVec = (LongColumnVector) colVector;
      int daysSinceEpoch;
      if (logicalType instanceof LogicalTypes.Date) {
        // Avro date logical type already stores days-since-epoch as an int.
        daysSinceEpoch = (int) value;
      } else if (value instanceof java.sql.Date) {
        daysSinceEpoch = DateWritable.dateToDays((java.sql.Date) value);
      } else if (value instanceof Date) {
        daysSinceEpoch = DateWritable.millisToDays(((Date) value).getTime());
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro DATE field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      dateColVec.vector[vectorPos] = daysSinceEpoch;
      break;
    case TIMESTAMP:
      TimestampColumnVector tsColVec = (TimestampColumnVector) colVector;
      long time;
      int nanos = 0;
      // The unit for Timestamp in ORC is millis, convert timestamp to millis if needed
      if (logicalType instanceof LogicalTypes.TimestampMillis) {
        time = (long) value;
      } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
        // Split micros into (millis, sub-milli nanos) to avoid losing precision.
        final long logicalTsValue = (long) value;
        time = logicalTsValue / MICROS_PER_MILLI;
        nanos = NANOS_PER_MICRO * ((int) (logicalTsValue % MICROS_PER_MILLI));
      } else if (value instanceof Timestamp) {
        Timestamp tsValue = (Timestamp) value;
        time = tsValue.getTime();
        nanos = tsValue.getNanos();
      } else if (value instanceof java.sql.Date) {
        java.sql.Date sqlDateValue = (java.sql.Date) value;
        time = sqlDateValue.getTime();
      } else if (value instanceof Date) {
        Date dateValue = (Date) value;
        time = dateValue.getTime();
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro TIMESTAMP field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      tsColVec.time[vectorPos] = time;
      tsColVec.nanos[vectorPos] = nanos;
      break;
    case BINARY:
      BytesColumnVector binaryColVec = (BytesColumnVector) colVector;
      byte[] binaryBytes;
      if (value instanceof GenericData.Fixed) {
        binaryBytes = ((GenericData.Fixed) value).bytes();
      } else if (value instanceof ByteBuffer) {
        // NOTE: consumes the buffer's remaining bytes (advances its position).
        final ByteBuffer byteBuffer = (ByteBuffer) value;
        binaryBytes = new byte[byteBuffer.remaining()];
        byteBuffer.get(binaryBytes);
      } else if (value instanceof byte[]) {
        binaryBytes = (byte[]) value;
      } else {
        throw new IllegalStateException(String.format(
            "Unrecognized type for Avro BINARY field value, which has type %s, value %s",
            value.getClass().getName(),
            value.toString()
        ));
      }
      binaryColVec.setRef(vectorPos, binaryBytes, 0, binaryBytes.length);
      break;
    case DECIMAL:
      DecimalColumnVector decimalColVec = (DecimalColumnVector) colVector;
      HiveDecimal decimalValue;
      if (value instanceof BigDecimal) {
        final BigDecimal decimal = (BigDecimal) value;
        decimalValue = HiveDecimal.create(decimal);
      } else if (value instanceof ByteBuffer) {
        // Avro 'bytes' decimal: two's-complement unscaled value; scale comes
        // from the ORC type, not from the bytes themselves.
        final ByteBuffer byteBuffer = (ByteBuffer) value;
        final byte[] decimalBytes = new byte[byteBuffer.remaining()];
        byteBuffer.get(decimalBytes);
        final BigInteger bigInt = new BigInteger(decimalBytes);
        final int scale = type.getScale();
        BigDecimal bigDecVal = new BigDecimal(bigInt, scale);
        decimalValue = HiveDecimal.create(bigDecVal);
        if (decimalValue == null && decimalBytes.length > 0) {
          throw new IllegalStateException(
              "Unexpected read null HiveDecimal from bytes (base-64 encoded): "
                  + Base64.getEncoder().encodeToString(decimalBytes)
          );
        }
      } else if (value instanceof GenericData.Fixed) {
        final BigDecimal decimal = new Conversions.DecimalConversion()
            .fromFixed((GenericData.Fixed) value, avroSchema, logicalType);
        decimalValue = HiveDecimal.create(decimal);
      } else {
        throw new IllegalStateException(String.format(
            "Unexpected type for decimal (%s), cannot convert from Avro value",
            value.getClass().getCanonicalName()
        ));
      }
      // HiveDecimal.create returns null when the value does not fit; record it as null.
      if (decimalValue == null) {
        decimalColVec.isNull[vectorPos] = true;
        decimalColVec.noNulls = false;
      } else {
        decimalColVec.set(vectorPos, decimalValue);
      }
      break;
    case LIST:
      // Elements are appended to the shared child vector; offsets/lengths
      // record this row's slice of it.
      List<?> list = (List<?>) value;
      ListColumnVector listColVec = (ListColumnVector) colVector;
      listColVec.offsets[vectorPos] = listColVec.childCount;
      listColVec.lengths[vectorPos] = list.size();
      TypeDescription listType = type.getChildren().get(0);
      for (Object listItem : list) {
        addToVector(listType, listColVec.child, avroSchema.getElementType(), listItem, listColVec.childCount++);
      }
      break;
    case MAP:
      Map<String, ?> mapValue = (Map<String, ?>) value;
      MapColumnVector mapColumnVector = (MapColumnVector) colVector;
      mapColumnVector.offsets[vectorPos] = mapColumnVector.childCount;
      mapColumnVector.lengths[vectorPos] = mapValue.size();
      // keys are always strings
      Schema keySchema = Schema.create(Schema.Type.STRING);
      for (Map.Entry<String, ?> entry : mapValue.entrySet()) {
        addToVector(
            type.getChildren().get(0),
            mapColumnVector.keys,
            keySchema,
            entry.getKey(),
            mapColumnVector.childCount
        );
        addToVector(
            type.getChildren().get(1),
            mapColumnVector.values,
            avroSchema.getValueType(),
            entry.getValue(),
            mapColumnVector.childCount
        );
        mapColumnVector.childCount++;
      }
      break;
    case STRUCT:
      // NOTE(review): pairs ORC children with Avro fields by position, and
      // assumes the two schemas have the same field order — confirm this
      // holds for evolved/reordered schemas.
      StructColumnVector structColVec = (StructColumnVector) colVector;
      GenericData.Record record = (GenericData.Record) value;
      for (int i = 0; i < type.getFieldNames().size(); i++) {
        String fieldName = type.getFieldNames().get(i);
        Object fieldValue = record.get(fieldName);
        TypeDescription fieldType = type.getChildren().get(i);
        addToVector(fieldType, structColVec.fields[i], avroSchema.getFields().get(i).schema(), fieldValue, vectorPos);
      }
      break;
    case UNION:
      UnionColumnVector unionColVec = (UnionColumnVector) colVector;
      List<TypeDescription> childTypes = type.getChildren();
      boolean added = addUnionValue(unionColVec, childTypes, avroSchema, value, vectorPos);
      if (!added) {
        throw new IllegalStateException(String.format(
            "Failed to add value %s to union with type %s",
            value == null ? "null" : value.toString(),
            type.toString()
        ));
      }
      break;
    default:
      throw new IllegalArgumentException("Invalid TypeDescription " + type.toString() + ".");
  }
}
/**
* Match value with its ORC type and add to the union vector at a given position.
*
* @param unionVector The vector to store value.
* @param unionChildTypes All possible types for the value Object.
* @param avroSchema Avro union schema for the value Object.
* @param value Object to be added to the unionVector
* @param vectorPos The position in the vector where value will be stored at.
* @return succeeded or failed
*/
public static boolean addUnionValue(
    UnionColumnVector unionVector,
    List<TypeDescription> unionChildTypes,
    Schema avroSchema,
    Object value,
    int vectorPos
) {
  // Probe each union member in order; the first member whose ORC category is
  // compatible with the runtime type of `value` wins.
  int matchIndex = -1;
  TypeDescription matchType = null;
  Object matchValue = null;
  for (int t = 0; t < unionChildTypes.size(); t++) {
    TypeDescription childType = unionChildTypes.get(t);
    boolean matches = false;
    switch (childType.getCategory()) {
      case BOOLEAN:
        matches = value instanceof Boolean;
        break;
      case BYTE:
        matches = value instanceof Byte;
        break;
      case SHORT:
        matches = value instanceof Short;
        break;
      case INT:
        matches = value instanceof Integer;
        break;
      case LONG:
        matches = value instanceof Long;
        break;
      case FLOAT:
        matches = value instanceof Float;
        break;
      case DOUBLE:
        matches = value instanceof Double;
        break;
      case STRING:
      case VARCHAR:
      case CHAR:
        if (value instanceof String) {
          matches = true;
          matchValue = ((String) value).getBytes(StandardCharsets.UTF_8);
        } else if (value instanceof Utf8) {
          matches = true;
          matchValue = ((Utf8) value).getBytes();
        }
        break;
      case DATE:
        matches = value instanceof Date;
        break;
      case TIMESTAMP:
        matches = value instanceof Timestamp;
        break;
      case BINARY:
        matches = value instanceof byte[] || value instanceof GenericData.Fixed;
        break;
      case DECIMAL:
        matches = value instanceof BigDecimal;
        break;
      case LIST:
        matches = value instanceof List;
        break;
      case MAP:
        matches = value instanceof Map;
        break;
      case STRUCT:
        throw new UnsupportedOperationException("Cannot handle STRUCT within UNION.");
      case UNION:
        // A nested empty union stands in for the NULL type; a null value only
        // matches that. Non-null values recurse into the nested union.
        List<TypeDescription> children = childType.getChildren();
        if (value == null) {
          matches = children == null || children.size() == 0;
        } else {
          matches = addUnionValue(unionVector, children, avroSchema, value, vectorPos);
        }
        break;
      default:
        throw new IllegalArgumentException("Invalid TypeDescription " + childType.getCategory().toString() + ".");
    }
    if (matches) {
      matchIndex = t;
      matchType = childType;
      break;
    }
  }
  // NOTE(review): matchValue is only ever assigned in branches guarded by
  // `value instanceof ...` (i.e. value != null), so this condition appears
  // unreachable. The original String/Utf8 value is therefore passed through
  // to addToVector unchanged, which handles both representations — confirm
  // against the upstream streamsets converter before "fixing" this.
  if (value == null && matchValue != null) {
    value = matchValue;
  }
  if (matchIndex >= 0) {
    // tags[] records which union member this row uses.
    unionVector.tags[vectorPos] = matchIndex;
    if (value == null) {
      unionVector.isNull[vectorPos] = true;
      unionVector.noNulls = false;
    } else {
      addToVector(matchType, unionVector.fields[matchIndex], avroSchema.getTypes().get(matchIndex), value, vectorPos);
    }
    return true;
  } else {
    return false;
  }
}
/**
* Read the Column vector at a given position conforming to a given ORC schema.
*
* @param type ORC schema of the object to read.
* @param colVector The column vector to read.
* @param avroSchema Avro schema of the object to read.
* Only used to check logical types for timestamp unit conversion.
* @param vectorPos The position in the vector where the value to read is stored at.
* @return The object being read.
*/
public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, int vectorPos) {
  // A repeating vector stores its single value at position 0.
  if (colVector.isRepeating) {
    vectorPos = 0;
  }
  if (colVector.isNull[vectorPos]) {
    return null;
  }
  // Collapse a nullable union (["null", T]) to its concrete member schema.
  if (avroSchema.getType().equals(Schema.Type.UNION)) {
    avroSchema = getActualSchemaType(avroSchema);
  }
  LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
  switch (type.getCategory()) {
    case BOOLEAN:
      return ((LongColumnVector) colVector).vector[vectorPos] != 0;
    case BYTE:
      return (byte) ((LongColumnVector) colVector).vector[vectorPos];
    case SHORT:
      return (short) ((LongColumnVector) colVector).vector[vectorPos];
    case INT:
      return (int) ((LongColumnVector) colVector).vector[vectorPos];
    case LONG:
      return ((LongColumnVector) colVector).vector[vectorPos];
    case FLOAT:
      return (float) ((DoubleColumnVector) colVector).vector[vectorPos];
    case DOUBLE:
      return ((DoubleColumnVector) colVector).vector[vectorPos];
    case VARCHAR:
    case CHAR:
      int maxLength = type.getMaxLength();
      String result = ((BytesColumnVector) colVector).toString(vectorPos);
      if (result.length() <= maxLength) {
        return result;
      } else {
        throw new HoodieIOException("CHAR/VARCHAR has length " + result.length() + " greater than Max Length allowed");
      }
    case STRING:
      // Honor the "avro.java.string" property: return a java.lang.String when the
      // schema asks for it, otherwise Avro's default Utf8 representation.
      // BUG FIX: the original compared the String property value against the enum
      // constant StringType.String; String.equals(non-String) is always false, so
      // the property was silently ignored and a Utf8 was always returned. Compare
      // against the enum's name ("String") instead.
      String stringType = avroSchema.getProp(GenericData.STRING_PROP);
      if (stringType == null || !stringType.equals(StringType.String.name())) {
        int stringLength = ((BytesColumnVector) colVector).length[vectorPos];
        int stringOffset = ((BytesColumnVector) colVector).start[vectorPos];
        byte[] stringBytes = new byte[stringLength];
        System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], stringOffset, stringBytes, 0, stringLength);
        return new Utf8(stringBytes);
      } else {
        return ((BytesColumnVector) colVector).toString(vectorPos);
      }
    case DATE:
      // convert to daysSinceEpoch for LogicalType.Date
      return (int) ((LongColumnVector) colVector).vector[vectorPos];
    case TIMESTAMP:
      // The unit of time in ORC is millis. Convert (time,nanos) to the desired unit per logicalType
      long time = ((TimestampColumnVector) colVector).time[vectorPos];
      int nanos = ((TimestampColumnVector) colVector).nanos[vectorPos];
      if (logicalType instanceof LogicalTypes.TimestampMillis) {
        return time;
      } else if (logicalType instanceof LogicalTypes.TimestampMicros) {
        return time * MICROS_PER_MILLI + nanos / NANOS_PER_MICRO;
      } else {
        // No timestamp logical type: fall back to ORC's own long representation.
        return ((TimestampColumnVector) colVector).getTimestampAsLong(vectorPos);
      }
    case BINARY:
      int binaryLength = ((BytesColumnVector) colVector).length[vectorPos];
      int binaryOffset = ((BytesColumnVector) colVector).start[vectorPos];
      byte[] binaryBytes = new byte[binaryLength];
      System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], binaryOffset, binaryBytes, 0, binaryLength);
      // return a ByteBuffer to be consistent with AvroRecordConverter
      return ByteBuffer.wrap(binaryBytes);
    case DECIMAL:
      // HiveDecimal always ignores trailing zeros, thus modifies the scale implicitly,
      // therefore, the scale must be enforced here.
      // NOTE(review): this cast assumes the Avro schema carries a decimal logical
      // type whenever the ORC type is DECIMAL; a plain BYTES/FIXED schema would
      // throw ClassCastException here — confirm upstream schemas guarantee this.
      BigDecimal bigDecimal = ((DecimalColumnVector) colVector).vector[vectorPos]
          .getHiveDecimal().bigDecimalValue()
          .setScale(((LogicalTypes.Decimal) logicalType).getScale());
      Schema.Type baseType = avroSchema.getType();
      if (baseType.equals(Schema.Type.FIXED)) {
        return new Conversions.DecimalConversion().toFixed(bigDecimal, avroSchema, logicalType);
      } else if (baseType.equals(Schema.Type.BYTES)) {
        return bigDecimal.unscaledValue().toByteArray();
      } else {
        throw new HoodieIOException(baseType.getName() + "is not a valid type for LogicalTypes.DECIMAL.");
      }
    case LIST:
      // Read this row's slice [listOffset, listOffset + listLength) of the child vector.
      ArrayList<Object> list = new ArrayList<>();
      ListColumnVector listVector = (ListColumnVector) colVector;
      int listLength = (int) listVector.lengths[vectorPos];
      int listOffset = (int) listVector.offsets[vectorPos];
      list.ensureCapacity(listLength);
      TypeDescription childType = type.getChildren().get(0);
      for (int i = 0; i < listLength; i++) {
        list.add(readFromVector(childType, listVector.child, avroSchema.getElementType(), listOffset + i));
      }
      return list;
    case MAP:
      Map<String, Object> map = new HashMap<String, Object>();
      MapColumnVector mapVector = (MapColumnVector) colVector;
      int mapLength = (int) mapVector.lengths[vectorPos];
      int mapOffset = (int) mapVector.offsets[vectorPos];
      // keys are always strings for maps in Avro
      Schema keySchema = Schema.create(Schema.Type.STRING);
      for (int i = 0; i < mapLength; i++) {
        map.put(
            readFromVector(type.getChildren().get(0), mapVector.keys, keySchema, i + mapOffset).toString(),
            readFromVector(type.getChildren().get(1), mapVector.values,
                avroSchema.getValueType(), i + mapOffset));
      }
      return map;
    case STRUCT:
      // NOTE(review): pairs ORC children with Avro fields by position — assumes
      // the two schemas have identical field order.
      StructColumnVector structVector = (StructColumnVector) colVector;
      List<TypeDescription> children = type.getChildren();
      GenericData.Record record = new GenericData.Record(avroSchema);
      for (int i = 0; i < children.size(); i++) {
        record.put(i, readFromVector(children.get(i), structVector.fields[i],
            avroSchema.getFields().get(i).schema(), vectorPos));
      }
      return record;
    case UNION:
      // tags[] says which union member this row stored; read only that field vector.
      UnionColumnVector unionVector = (UnionColumnVector) colVector;
      int tag = unionVector.tags[vectorPos];
      ColumnVector fieldVector = unionVector.fields[tag];
      return readFromVector(type.getChildren().get(tag), fieldVector, avroSchema.getTypes().get(tag), vectorPos);
    default:
      throw new HoodieIOException("Unrecognized TypeDescription " + type.toString());
  }
}
/**
 * Derives the ORC schema equivalent of an Avro schema.
 *
 * <p>Avro logical types (decimal, date, time, timestamp) take precedence over the
 * underlying primitive type; otherwise the mapping follows the Avro base type.
 *
 * @param avroSchema the Avro schema to convert.
 * @return the corresponding ORC {@link TypeDescription}.
 */
public static TypeDescription createOrcSchema(Schema avroSchema) {
  TypeDescription logicalMapping = orcTypeForLogicalType(avroSchema.getLogicalType());
  if (logicalMapping != null) {
    return logicalMapping;
  }
  final Schema.Type type = avroSchema.getType();
  switch (type) {
    case NULL:
      // An empty ORC union stands in for the Avro null type.
      return TypeDescription.createUnion();
    case LONG:
      return TypeDescription.createLong();
    case INT:
      return TypeDescription.createInt();
    case BYTES:
      return TypeDescription.createBinary();
    case ARRAY:
      return TypeDescription.createList(createOrcSchema(avroSchema.getElementType()));
    case RECORD:
      final TypeDescription struct = TypeDescription.createStruct();
      for (Schema.Field avroField : avroSchema.getFields()) {
        final TypeDescription memberType = createOrcSchema(avroField.schema());
        if (memberType != null) {
          struct.addField(avroField.name(), memberType);
        }
      }
      return struct;
    case MAP:
      // In Avro maps, keys are always strings.
      return TypeDescription.createMap(
          TypeDescription.createString(),
          createOrcSchema(avroSchema.getValueType())
      );
    case UNION:
      final List<Schema> nonNullMembers = avroSchema.getTypes().stream()
          .filter(member -> !Schema.Type.NULL.equals(member.getType()))
          .collect(Collectors.toList());
      if (nonNullMembers.isEmpty()) {
        // No non-null members: same representation as the NULL type.
        return TypeDescription.createUnion();
      }
      if (nonNullMembers.size() == 1) {
        // ["null", T] is Avro's nullable T; ORC types are nullable already,
        // so the single member's type is used directly.
        return createOrcSchema(nonNullMembers.get(0));
      }
      // A genuine multi-type union maps to an ORC union of the non-null members.
      final TypeDescription orcUnion = TypeDescription.createUnion();
      for (final Schema member : nonNullMembers) {
        orcUnion.addUnionChild(createOrcSchema(member));
      }
      return orcUnion;
    case STRING:
    case ENUM: // enums are represented as strings for now
      return TypeDescription.createString();
    case FLOAT:
      return TypeDescription.createFloat();
    case DOUBLE:
      return TypeDescription.createDouble();
    case BOOLEAN:
      return TypeDescription.createBoolean();
    case FIXED:
      return TypeDescription.createBinary();
    default:
      throw new IllegalStateException(String.format("Unrecognized Avro type: %s", type.getName()));
  }
}

/**
 * Maps an Avro logical type to its ORC representation.
 *
 * @param logicalType the Avro logical type, possibly null.
 * @return the ORC type for a recognized logical type, or null when none applies
 *         (caller then falls back to the base-type mapping).
 */
private static TypeDescription orcTypeForLogicalType(LogicalType logicalType) {
  if (logicalType == null) {
    return null;
  }
  if (logicalType instanceof LogicalTypes.Decimal) {
    LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
    return TypeDescription.createDecimal()
        .withPrecision(decimal.getPrecision())
        .withScale(decimal.getScale());
  }
  if (logicalType instanceof LogicalTypes.Date) {
    // date: days since the unix epoch, annotating an Avro int.
    return TypeDescription.createDate();
  }
  if (logicalType instanceof LogicalTypes.TimeMillis) {
    // time-millis: millis after midnight, annotating an Avro int;
    // ORC has no time-of-day type, so the int is preserved as-is.
    return TypeDescription.createInt();
  }
  if (logicalType instanceof LogicalTypes.TimeMicros) {
    // time-micros: micros after midnight, annotating an Avro long.
    return TypeDescription.createLong();
  }
  if (logicalType instanceof LogicalTypes.TimestampMillis
      || logicalType instanceof LogicalTypes.TimestampMicros) {
    // timestamp-millis / timestamp-micros: an instant since the unix epoch.
    return TypeDescription.createTimestamp();
  }
  // Unrecognized logical type: defer to the base-type mapping.
  return null;
}
/**
 * Derives the Avro schema equivalent of an ORC schema.
 *
 * <p>Lossy in places by necessity: ORC has no way to distinguish time-millis from
 * time-micros ints/longs, and its timestamps are assumed to be timestamp-millis.
 *
 * @param orcSchema the ORC schema to convert.
 * @return the corresponding Avro {@link Schema}.
 */
public static Schema createAvroSchema(TypeDescription orcSchema) {
  switch (orcSchema.getCategory()) {
    case BOOLEAN:
      return Schema.create(Schema.Type.BOOLEAN);
    case BYTE:   // tinyint (8 bit), widened to int
    case SHORT:  // smallint (16 bit), widened to int
    case INT:
      // Could originally have carried a time-millis logical type, but there is
      // no way to tell from the ORC side.
      return Schema.create(Schema.Type.INT);
    case LONG:
      // Could originally have carried a time-micros logical type; indistinguishable here.
      return Schema.create(Schema.Type.LONG);
    case FLOAT:
      return Schema.create(Schema.Type.FLOAT);
    case DOUBLE:
      return Schema.create(Schema.Type.DOUBLE);
    case VARCHAR:
    case CHAR:
    case STRING:
      return Schema.create(Schema.Type.STRING);
    case DATE:
      // Annotate an int with the date logical type (days since epoch).
      return LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT));
    case TIMESTAMP:
      // Cannot distinguish TIMESTAMP_MILLIS from TIMESTAMP_MICROS; assume millis
      // because ORC timestamps are stored in millis.
      return LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG));
    case BINARY:
      return Schema.create(Schema.Type.BYTES);
    case DECIMAL:
      // Carry precision/scale over via the decimal logical type on bytes.
      return LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale())
          .addToSchema(Schema.create(Schema.Type.BYTES));
    case LIST:
      return Schema.createArray(createAvroSchema(orcSchema.getChildren().get(0)));
    case MAP:
      // Child 0 is the key type (always string in Avro); only the value type matters.
      return Schema.createMap(createAvroSchema(orcSchema.getChildren().get(1)));
    case STRUCT:
      List<TypeDescription> memberTypes = orcSchema.getChildren();
      List<String> memberNames = orcSchema.getFieldNames();
      List<Field> recordFields = new ArrayList<>(memberTypes.size());
      for (int idx = 0; idx < memberTypes.size(); idx++) {
        recordFields.add(new Field(memberNames.get(idx), createAvroSchema(memberTypes.get(idx)), "", null));
      }
      return Schema.createRecord(recordFields);
    case UNION:
      return Schema.createUnion(orcSchema.getChildren().stream()
          .map(AvroOrcUtils::createAvroSchema)
          .collect(Collectors.toList()));
    default:
      throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName()));
  }
}
/**
 * Collapses an Avro union down to its non-NULL content.
 *
 * <p>All ORC types are nullable by construction, while Avro expresses nullability
 * as a union containing the NULL type. Stripping the NULL member keeps the Avro
 * and ORC schemas consistent with each other.
 *
 * @param unionSchema a schema of union type.
 * @return the NULL schema when the union holds nothing else, the single remaining
 *         member when exactly one is left, or a union of all non-NULL members.
 */
private static Schema getActualSchemaType(Schema unionSchema) {
  final List<Schema> concreteMembers = new ArrayList<>();
  for (Schema member : unionSchema.getTypes()) {
    if (!Schema.Type.NULL.equals(member.getType())) {
      concreteMembers.add(member);
    }
  }
  switch (concreteMembers.size()) {
    case 0:
      return Schema.create(Schema.Type.NULL);
    case 1:
      return concreteMembers.get(0);
    default:
      return Schema.createUnion(concreteMembers);
  }
}
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.common.util;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -25,16 +26,22 @@ import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.exception.HoodieException;
public abstract class BaseFileUtils {
/**
 * Resolves the reader utilities for a base file by its extension.
 *
 * @param path path of the base file (must end in a known extension).
 * @return the matching {@link BaseFileUtils} implementation.
 * @throws UnsupportedOperationException when the extension is not recognized.
 */
public static BaseFileUtils getInstance(String path) {
  if (path.endsWith(HoodieFileFormat.ORC.getFileExtension())) {
    return new OrcUtils();
  }
  if (path.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
    return new ParquetUtils();
  }
  throw new UnsupportedOperationException("The format for file " + path + " is not supported yet.");
}
@@ -42,6 +49,8 @@ public abstract class BaseFileUtils {
/**
 * Resolves the reader utilities for a given base file format.
 *
 * @param fileFormat the table's base file format.
 * @return the matching {@link BaseFileUtils} implementation.
 * @throws UnsupportedOperationException for formats without a reader implementation.
 */
public static BaseFileUtils getInstance(HoodieFileFormat fileFormat) {
  switch (fileFormat) {
    case PARQUET:
      return new ParquetUtils();
    case ORC:
      return new OrcUtils();
    default:
      throw new UnsupportedOperationException(fileFormat.name() + " format not supported yet.");
  }
}
@@ -50,24 +59,122 @@ public abstract class BaseFileUtils {
return getInstance(metaClient.getTableConfig().getBaseFileFormat());
}
public abstract Set<String> readRowKeys(Configuration configuration, Path filePath);
/**
 * Read the rowKey list from the given data file.
 *
 * @param filePath The data file path
 * @param configuration configuration to build fs object
 * @return Set Set of row keys
 */
public Set<String> readRowKeys(Configuration configuration, Path filePath) {
  // Delegates with an empty candidate set — presumably filterRowKeys treats an
  // empty filter as "no filtering" and returns every row key; confirm against
  // the concrete implementations (ParquetUtils/OrcUtils).
  return filterRowKeys(configuration, filePath, new HashSet<>());
}
public abstract Set<String> filterRowKeys(Configuration configuration, Path filePath, Set<String> filter);
/**
 * Read the bloom filter from the metadata of the given data file.
 *
 * @param configuration Configuration
 * @param filePath The data file path
 * @return the deserialized {@link BloomFilter}, or null when no bloom filter
 *         is present in the file footer.
 */
public BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path filePath) {
  Map<String, String> footers = readFooter(configuration, false, filePath,
      HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
      HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
      HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
  String serializedFilter = footers.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
  if (serializedFilter == null) {
    // Fall back to the legacy key "com.uber.hoodie.bloomfilter".
    serializedFilter = footers.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
  }
  if (serializedFilter == null) {
    // No bloom filter stored in this file.
    return null;
  }
  // Older files carry no type code; default to the SIMPLE filter implementation.
  String typeCode = footers.containsKey(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)
      ? footers.get(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)
      : BloomFilterTypeCode.SIMPLE.name();
  return BloomFilterFactory.fromString(serializedFilter, typeCode);
}
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
public abstract Schema readAvroSchema(Configuration configuration, Path filePath);
public abstract BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path filePath);
public abstract String[] readMinMaxRecordKeys(Configuration configuration, Path filePath);
/**
* Read the min and max record key from the metadata of the given data file.
* @param configuration Configuration
* @param filePath The data file path
* @return A array of two string where the first is min record key and the second is max record key
*/
public String[] readMinMaxRecordKeys(Configuration configuration, Path filePath) {
Map<String, String> minMaxKeys = readFooter(configuration, true, filePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
throw new HoodieException(
String.format("Could not read min/max record key out of footer correctly from %s. read) : %s",
filePath, minMaxKeys));
}
return new String[] {minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
}
/**
* Read the data file
* NOTE: This literally reads the entire file contents, thus should be used with caution.
* @param configuration Configuration
* @param filePath The data file path
* @return A list of GenericRecord
*/
public abstract List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath);
/**
* Read the data file using the given schema
* NOTE: This literally reads the entire file contents, thus should be used with caution.
* @param configuration Configuration
* @param filePath The data file path
* @return A list of GenericRecord
*/
public abstract List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema schema);
public abstract Map<String, String> readFooter(Configuration conf, boolean required, Path orcFilePath,
String... footerNames);
/**
* Read the footer data of the given data file.
* @param configuration Configuration
* @param required require the footer data to be in data file
* @param filePath The data file path
* @param footerNames The footer names to read
* @return A map where the key is the footer name and the value is the footer value
*/
public abstract Map<String, String> readFooter(Configuration configuration, boolean required, Path filePath,
String... footerNames);
public abstract long getRowCount(Configuration conf, Path filePath);
}
/**
* Returns the number of records in the data file.
* @param configuration Configuration
* @param filePath The data file path
*/
public abstract long getRowCount(Configuration configuration, Path filePath);
/**
* Read the rowKey list matching the given filter, from the given data file.
* If the filter is empty, then this will return all the row keys.
* @param filePath The data file path
* @param configuration configuration to build fs object
* @param filter record keys filter
* @return Set Set of row keys matching candidateRecordKeys
*/
public abstract Set<String> filterRowKeys(Configuration configuration, Path filePath, Set<String> filter);
/**
* Fetch {@link HoodieKey}s from the given data file.
* @param configuration configuration to build fs object
* @param filePath The data file path
* @return {@link List} of {@link HoodieKey}s fetched from the parquet file
*/
public abstract List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath);
/**
* Read the Avro schema of the data file.
* @param configuration Configuration
* @param filePath The data file path
* @return The Avro schema of the data file
*/
public abstract Schema readAvroSchema(Configuration configuration, Path filePath);
}

View File

@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import java.io.IOException;
import java.util.Iterator;
/**
* This class wraps a ORC reader and provides an iterator based api to read from an ORC file.
*/
/**
 * Wraps an ORC {@link RecordReader} and exposes its contents through the
 * {@link Iterator} API, converting each ORC row into an Avro
 * {@link GenericData.Record} built against the supplied Avro schema.
 *
 * <p>NOTE: not thread-safe — the underlying {@link RecordReader} and the
 * current {@link VectorizedRowBatch} are mutable shared state. The caller
 * owns the {@link RecordReader} and is responsible for closing it.
 */
public class OrcReaderIterator<T> implements Iterator<T> {

  private final RecordReader recordReader;
  private final Schema avroSchema;
  // Per-field metadata, aligned by index: ORC field names, ORC field types,
  // and the corresponding Avro field schemas.
  private final List<String> fieldNames;
  private final List<TypeDescription> orcFieldTypes;
  private final Schema[] avroFieldSchemas;
  private final VectorizedRowBatch batch;
  // Index of the next unconsumed row within the current batch.
  private int rowInBatch;
  // Pre-fetched record so hasNext() can answer definitively without losing data.
  private T next;

  /**
   * @param recordReader the ORC row reader to consume (closed by the caller, not here)
   * @param schema       Avro schema used to build each output record
   * @param orcSchema    ORC schema of the file; its fields must all exist in {@code schema}
   */
  public OrcReaderIterator(RecordReader recordReader, Schema schema, TypeDescription orcSchema) {
    this.recordReader = recordReader;
    this.avroSchema = schema;
    this.fieldNames = orcSchema.getFieldNames();
    this.orcFieldTypes = orcSchema.getChildren();
    // NOTE(review): assumes every ORC field name resolves in the Avro schema;
    // avroSchema.getField(...) would return null (then NPE) otherwise — confirm callers
    // always pass matching schemas.
    this.avroFieldSchemas = this.fieldNames.stream()
        .map(fieldName -> avroSchema.getField(fieldName).schema())
        .toArray(Schema[]::new);
    this.batch = orcSchema.createRowBatch();
    this.rowInBatch = 0;
  }

  /**
   * If the current batch is exhausted, pull the next one from the reader.
   *
   * @return true if at least one more row is available.
   * @throws IOException on read failure
   */
  private boolean ensureBatch() throws IOException {
    if (rowInBatch >= batch.size) {
      rowInBatch = 0;
      return recordReader.nextBatch(batch);
    }
    return true;
  }

  @Override
  public boolean hasNext() {
    try {
      // Pre-fetch one record; readRecordFromBatch() performs ensureBatch() itself.
      if (this.next == null) {
        this.next = readRecordFromBatch();
      }
      return this.next != null;
    } catch (IOException io) {
      throw new HoodieIOException("unable to read next record from ORC file ", io);
    }
  }

  @Override
  public T next() {
    try {
      // Handle the case where next() is called before hasNext().
      if (this.next == null && !hasNext()) {
        throw new HoodieIOException("No more records left to read from ORC file");
      }
      T retVal = this.next;
      this.next = readRecordFromBatch();
      return retVal;
    } catch (IOException io) {
      throw new HoodieIOException("unable to read next record from ORC file ", io);
    }
  }

  /**
   * Converts the current row of the batch into an Avro record, advancing the
   * row cursor; returns null once the file is exhausted.
   */
  @SuppressWarnings("unchecked")
  private T readRecordFromBatch() throws IOException {
    // No more records left to read from the ORC file.
    if (!ensureBatch()) {
      return null;
    }
    GenericData.Record record = new Record(avroSchema);
    int numFields = orcFieldTypes.size();
    for (int i = 0; i < numFields; i++) {
      Object data = AvroOrcUtils.readFromVector(orcFieldTypes.get(i), batch.cols[i], avroFieldSchemas[i], rowInBatch);
      record.put(fieldNames.get(i), data);
    }
    rowInBatch++;
    return (T) record;
  }
}

View File

@@ -0,0 +1,235 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto.UserMetadataItem;
import org.apache.orc.Reader;
import org.apache.orc.Reader.Options;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
/**
* Utility functions for ORC files.
*/
public class OrcUtils extends BaseFileUtils {
/**
* Fetch {@link HoodieKey}s from the given ORC file.
*
* @param filePath The ORC file path.
* @param configuration configuration to build fs object
* @return {@link List} of {@link HoodieKey}s fetched from the ORC file
*/
@Override
public List<HoodieKey> fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) {
List<HoodieKey> hoodieKeys = new ArrayList<>();
try {
if (!filePath.getFileSystem(configuration).exists(filePath)) {
return new ArrayList<>();
}
Configuration conf = new Configuration(configuration);
conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema);
List<String> fieldNames = orcSchema.getFieldNames();
VectorizedRowBatch batch = orcSchema.createRowBatch();
RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
// column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields
int keyCol = -1;
int partitionCol = -1;
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
keyCol = i;
}
if (fieldNames.get(i).equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) {
partitionCol = i;
}
}
if (keyCol == -1 || partitionCol == -1) {
throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath));
}
while (recordReader.nextBatch(batch)) {
BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[keyCol];
BytesColumnVector partitionPaths = (BytesColumnVector) batch.cols[partitionCol];
for (int i = 0; i < batch.size; i++) {
String rowKey = rowKeys.toString(i);
String partitionPath = partitionPaths.toString(i);
hoodieKeys.add(new HoodieKey(rowKey, partitionPath));
}
}
} catch (IOException e) {
throw new HoodieIOException("Failed to read from ORC file:" + filePath, e);
}
return hoodieKeys;
}
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath) {
Schema avroSchema;
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration));
avroSchema = AvroOrcUtils.createAvroSchema(reader.getSchema());
} catch (IOException io) {
throw new HoodieIOException("Unable to read Avro records from an ORC file:" + filePath, io);
}
return readAvroRecords(configuration, filePath, avroSchema);
}
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
@Override
public List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) {
List<GenericRecord> records = new ArrayList<>();
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration));
TypeDescription orcSchema = reader.getSchema();
RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema));
OrcReaderIterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
while (iterator.hasNext()) {
GenericRecord record = iterator.next();
records.add(record);
}
} catch (IOException io) {
throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io);
}
return records;
}
/**
* Read the rowKey list matching the given filter, from the given ORC file. If the filter is empty, then this will
* return all the rowkeys.
*
* @param conf configuration to build fs object.
* @param filePath The ORC file path.
* @param filter record keys filter
* @return Set Set of row keys matching candidateRecordKeys
*/
@Override
public Set<String> filterRowKeys(Configuration conf, Path filePath, Set<String> filter)
throws HoodieIOException {
try {
Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
Set<String> filteredRowKeys = new HashSet<>();
TypeDescription schema = reader.getSchema();
List<String> fieldNames = schema.getFieldNames();
VectorizedRowBatch batch = schema.createRowBatch();
RecordReader recordReader = reader.rows(new Options(conf).schema(schema));
// column index for the RECORD_KEY_METADATA_FIELD field
int colIndex = -1;
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) {
colIndex = i;
break;
}
}
if (colIndex == -1) {
throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath));
}
while (recordReader.nextBatch(batch)) {
BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex];
for (int i = 0; i < batch.size; i++) {
String rowKey = rowKeys.toString(i);
if (filter.isEmpty() || filter.contains(rowKey)) {
filteredRowKeys.add(rowKey);
}
}
}
return filteredRowKeys;
} catch (IOException io) {
throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io);
}
}
@Override
public Map<String, String> readFooter(Configuration conf, boolean required,
Path orcFilePath, String... footerNames) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
Map<String, String> footerVals = new HashMap<>();
List<UserMetadataItem> metadataItemList = reader.getFileTail().getFooter().getMetadataList();
Map<String, String> metadata = metadataItemList.stream().collect(Collectors.toMap(
UserMetadataItem::getName,
metadataItem -> metadataItem.getValue().toStringUtf8()));
for (String footerName : footerNames) {
if (metadata.containsKey(footerName)) {
footerVals.put(footerName, metadata.get(footerName));
} else if (required) {
throw new MetadataNotFoundException(
"Could not find index in ORC footer. Looked for key " + footerName + " in "
+ orcFilePath);
}
}
return footerVals;
} catch (IOException io) {
throw new HoodieIOException("Unable to read footer for ORC file:" + orcFilePath, io);
}
}
@Override
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
TypeDescription orcSchema = reader.getSchema();
return AvroOrcUtils.createAvroSchema(orcSchema);
} catch (IOException io) {
throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
}
}
@Override
public long getRowCount(Configuration conf, Path orcFilePath) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
return reader.getNumberOfRows();
} catch (IOException io) {
throw new HoodieIOException("Unable to get row count for ORC file:" + orcFilePath, io);
}
}
}

View File

@@ -19,14 +19,9 @@
package org.apache.hudi.common.util;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
@@ -57,18 +52,6 @@ import java.util.function.Function;
*/
public class ParquetUtils extends BaseFileUtils {
/**
* Read the rowKey list from the given parquet file.
*
* @param filePath The parquet file path.
* @param configuration configuration to build fs object
* @return Set Set of row keys
*/
@Override
public Set<String> readRowKeys(Configuration configuration, Path filePath) {
return filterRowKeys(configuration, filePath, new HashSet<>());
}
/**
* Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
* return all the rowkeys.
@@ -196,47 +179,8 @@ public class ParquetUtils extends BaseFileUtils {
@Override
public Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
return new AvroSchemaConverter(configuration).convert(readSchema(configuration, parquetFilePath));
}
/**
* Read out the bloom filter from the parquet file meta data.
*/
@Override
public BloomFilter readBloomFilterFromMetadata(Configuration configuration, Path parquetFilePath) {
Map<String, String> footerVals =
readFooter(configuration, false, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
String footerVal = footerVals.get(HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
if (null == footerVal) {
// We use old style key "com.uber.hoodie.bloomfilter"
footerVal = footerVals.get(HoodieAvroWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
}
BloomFilter toReturn = null;
if (footerVal != null) {
if (footerVals.containsKey(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) {
toReturn = BloomFilterFactory.fromString(footerVal,
footerVals.get(HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE));
} else {
toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name());
}
}
return toReturn;
}
@Override
public String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
Map<String, String> minMaxKeys = readFooter(configuration, true, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
throw new HoodieException(
String.format("Could not read min/max record key out of footer correctly from %s. read) : %s",
parquetFilePath, minMaxKeys));
}
return new String[] {minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
minMaxKeys.get(HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
MessageType parquetSchema = readSchema(configuration, parquetFilePath);
return new AvroSchemaConverter(configuration).convert(parquetSchema);
}
/**

View File

@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import java.io.IOException;
import static org.apache.hudi.common.model.HoodieFileFormat.ORC;
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
import static org.apache.hudi.common.model.HoodieFileFormat.HFILE;
@@ -40,6 +41,9 @@ public class HoodieFileReaderFactory {
if (HFILE.getFileExtension().equals(extension)) {
return newHFileFileReader(conf, path);
}
if (ORC.getFileExtension().equals(extension)) {
return newOrcFileReader(conf, path);
}
throw new UnsupportedOperationException(extension + " format not supported yet.");
}
@@ -52,4 +56,8 @@ public class HoodieFileReaderFactory {
CacheConfig cacheConfig = new CacheConfig(conf);
return new HoodieHFileReader<>(conf, path, cacheConfig);
}
private static <R extends IndexedRecord> HoodieFileReader<R> newOrcFileReader(Configuration conf, Path path) {
return new HoodieOrcReader<>(conf, path);
}
}

View File

@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io.storage;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.OrcReaderIterator;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.Reader.Options;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
/**
 * {@link HoodieFileReader} implementation for ORC base files. Metadata lookups
 * (min/max keys, bloom filter, schema, row count) are delegated to
 * {@link BaseFileUtils} for the ORC format; record iteration opens an ORC
 * reader directly.
 */
public class HoodieOrcReader<R extends IndexedRecord> implements HoodieFileReader {

  private final Path path;
  private final Configuration conf;
  private final BaseFileUtils orcUtils;

  /**
   * @param configuration Hadoop configuration used for all file access
   * @param path          path of the ORC base file
   */
  public HoodieOrcReader(Configuration configuration, Path path) {
    this.conf = configuration;
    this.path = path;
    this.orcUtils = BaseFileUtils.getInstance(HoodieFileFormat.ORC);
  }

  @Override
  public String[] readMinMaxRecordKeys() {
    return orcUtils.readMinMaxRecordKeys(conf, path);
  }

  @Override
  public BloomFilter readBloomFilter() {
    return orcUtils.readBloomFilterFromMetadata(conf, path);
  }

  @Override
  public Set<String> filterRowKeys(Set candidateRowKeys) {
    return orcUtils.filterRowKeys(conf, path, candidateRowKeys);
  }

  /**
   * Returns an iterator over the file's records, projected through the given
   * Avro schema (converted to an ORC read schema).
   *
   * @throws IOException declared by the interface; read failures surface as
   *                     {@link HoodieIOException}
   */
  @Override
  public Iterator<R> getRecordIterator(Schema schema) throws IOException {
    try {
      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
      TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema);
      RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
      // Diamond instead of the raw type to keep the iterator's element type as R.
      // NOTE(review): reader/recordReader stay open for the iterator's lifetime and
      // close() below does not release them — confirm callers drain the iterator or
      // tolerate the handle living until GC.
      return new OrcReaderIterator<>(recordReader, schema, orcSchema);
    } catch (IOException io) {
      throw new HoodieIOException("Unable to create an ORC reader.", io);
    }
  }

  @Override
  public Schema getSchema() {
    return orcUtils.readAvroSchema(conf, path);
  }

  @Override
  public void close() {
    // No long-lived resources held directly by this object; see note in
    // getRecordIterator() about the lifetime of per-iterator readers.
  }

  @Override
  public long getTotalRecords() {
    return orcUtils.getRowCount(conf, path);
  }
}