1
0

HUDI-1827 : Add ORC support in Bootstrap Op (#3457)

Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
manasaks
2021-11-06 21:53:20 +05:30
committed by GitHub
parent f41539a9cb
commit e0285800fb
16 changed files with 1187 additions and 70 deletions

View File

@@ -52,6 +52,7 @@ import org.apache.orc.storage.ql.exec.vector.UnionColumnVector;
import org.apache.orc.storage.serde2.io.DateWritable;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.orc.TypeDescription;
import static org.apache.avro.JsonProperties.NULL_VALUE;
/**
* Methods including addToVector, addUnionValue, createOrcSchema are originally from
@@ -796,4 +797,78 @@ public class AvroOrcUtils {
return Schema.createUnion(nonNullMembers);
}
}
/**
 * Converts an ORC struct schema into an Avro record schema, optionally wrapping
 * every field as {@code union(null, fieldType)} with a {@code null} default so the
 * resulting schema tolerates absent values.
 *
 * @param orcSchema  the ORC {@link TypeDescription} to convert (expected to be a struct)
 * @param recordName the name of the generated Avro record
 * @param namespace  namespace forwarded to the underlying conversion of nested records
 * @param nullable   when {@code true}, each field is made nullable with a null default;
 *                   when {@code false}, field schemas are kept as-is with no default
 * @return the generated Avro record {@link Schema}
 */
public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName, String namespace, boolean nullable) {
  Schema avroSchema = createAvroSchemaWithNamespace(orcSchema, recordName, namespace);
  List<Schema.Field> fields = new ArrayList<>();
  for (Field field : avroSchema.getFields()) {
    Schema fieldSchema = field.schema();
    if (nullable) {
      // Build the nullable union lazily: Schema.createUnion throws if the field schema
      // is itself a union, so this must not run on the non-nullable path.
      Schema nullableSchema = Schema.createUnion(Schema.create(Schema.Type.NULL), fieldSchema);
      fields.add(new Schema.Field(field.name(), nullableSchema, null, NULL_VALUE));
    } else {
      fields.add(new Schema.Field(field.name(), fieldSchema, null, (Object) null));
    }
  }
  Schema schema = Schema.createRecord(recordName, null, null, false);
  schema.setFields(fields);
  return schema;
}
/**
 * Maps an ORC {@link TypeDescription} to the corresponding Avro {@link Schema},
 * naming any generated record {@code recordName} under {@code namespace}.
 * Nested container/struct children are converted recursively with an empty namespace.
 *
 * @throws IllegalStateException if the ORC category has no Avro mapping
 */
private static Schema createAvroSchemaWithNamespace(TypeDescription orcSchema, String recordName, String namespace) {
  switch (orcSchema.getCategory()) {
    case BOOLEAN:
      return Schema.create(Schema.Type.BOOLEAN);
    case BYTE:   // tinyint (8 bit) widened to Avro int
    case SHORT:  // smallint (16 bit) widened to Avro int
    case INT:    // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but there is no way to distinguish
      return Schema.create(Schema.Type.INT);
    case LONG:
      // the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but there is no way to distinguish
      return Schema.create(Schema.Type.LONG);
    case FLOAT:
      return Schema.create(Schema.Type.FLOAT);
    case DOUBLE:
      return Schema.create(Schema.Type.DOUBLE);
    case VARCHAR:
    case CHAR:
    case STRING:
      return Schema.create(Schema.Type.STRING);
    case DATE: {
      Schema dateSchema = Schema.create(Schema.Type.INT);
      LogicalTypes.date().addToSchema(dateSchema);
      return dateSchema;
    }
    case TIMESTAMP: {
      Schema tsSchema = Schema.create(Schema.Type.LONG);
      LogicalTypes.timestampMillis().addToSchema(tsSchema);
      return tsSchema;
    }
    case BINARY:
      return Schema.create(Schema.Type.BYTES);
    case DECIMAL: {
      Schema decimalSchema = Schema.create(Schema.Type.BYTES);
      LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale()).addToSchema(decimalSchema);
      return decimalSchema;
    }
    case LIST:
      // Element type is the single child of an ORC LIST.
      return Schema.createArray(createAvroSchemaWithNamespace(orcSchema.getChildren().get(0), recordName, ""));
    case MAP:
      // Child 1 is the value type; Avro map keys are always strings.
      return Schema.createMap(createAvroSchemaWithNamespace(orcSchema.getChildren().get(1), recordName, ""));
    case STRUCT: {
      List<Field> structFields = new ArrayList<>();
      List<TypeDescription> childTypes = orcSchema.getChildren();
      List<String> childNames = orcSchema.getFieldNames();
      for (int idx = 0; idx < childTypes.size(); idx++) {
        String fieldName = childNames.get(idx);
        structFields.add(new Field(fieldName, createAvroSchemaWithNamespace(childTypes.get(idx), fieldName, ""), null, null));
      }
      return Schema.createRecord(recordName, null, namespace, false, structFields);
    }
    default:
      throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName()));
  }
}
}

View File

@@ -19,8 +19,6 @@
package org.apache.hudi.common.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -50,8 +48,6 @@ import org.apache.orc.Reader.Options;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_SCHEMA_METADATA_KEY;
/**
* Utility functions for ORC files.
*/
@@ -226,9 +222,8 @@ public class OrcUtils extends BaseFileUtils {
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
try {
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
ByteBuffer schemaBuffer = reader.getMetadataValue(HOODIE_AVRO_SCHEMA_METADATA_KEY);
String schemaText = StandardCharsets.UTF_8.decode(schemaBuffer).toString();
return new Schema.Parser().parse(schemaText);
TypeDescription orcSchema = reader.getSchema();
return AvroOrcUtils.createAvroSchema(orcSchema);
} catch (IOException io) {
throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
}

View File

@@ -111,6 +111,13 @@ public class HoodieTestUtils {
return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
}
/**
 * Initializes a Hudi table at {@code basePath} bootstrapped from an existing dataset,
 * recording the bootstrap source path and base file format in the table config.
 *
 * @param basePath          path where the Hudi table is created
 * @param tableType         COW/MOR table type
 * @param bootstrapBasePath location of the pre-existing source data
 * @param baseFileFormat    base file format to record (e.g. PARQUET, ORC)
 * @return the meta client for the newly initialized table
 * @throws IOException if table initialization fails
 */
public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, HoodieFileFormat baseFileFormat) throws IOException {
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.name());
  properties.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath);
  return init(getDefaultHadoopConf(), basePath, tableType, properties);
}
public static <T extends Serializable> T serializeDeserialize(T object, Class<T> clazz) {
// Using Kryo as the default serializer in Spark Jobs
Kryo kryo = new Kryo();