HUDI-1827 : Add ORC support in Bootstrap Op (#3457)
Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
@@ -52,6 +52,7 @@ import org.apache.orc.storage.ql.exec.vector.UnionColumnVector;
|
||||
import org.apache.orc.storage.serde2.io.DateWritable;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.orc.TypeDescription;
|
||||
import static org.apache.avro.JsonProperties.NULL_VALUE;
|
||||
|
||||
/**
|
||||
* Methods including addToVector, addUnionValue, createOrcSchema are originally from
|
||||
@@ -796,4 +797,78 @@ public class AvroOrcUtils {
|
||||
return Schema.createUnion(nonNullMembers);
|
||||
}
|
||||
}
|
||||
|
||||
public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName, String namespace, boolean nullable) {
|
||||
Schema avroSchema = createAvroSchemaWithNamespace(orcSchema,recordName,namespace);
|
||||
List<Schema.Field> fields = new ArrayList<Schema.Field>();
|
||||
List<Field> fieldList = avroSchema.getFields();
|
||||
for (Field field : fieldList) {
|
||||
Schema fieldSchema = field.schema();
|
||||
Schema nullableSchema = Schema.createUnion(Schema.create(Schema.Type.NULL),fieldSchema);
|
||||
if (nullable) {
|
||||
fields.add(new Schema.Field(field.name(), nullableSchema, null, NULL_VALUE));
|
||||
} else {
|
||||
fields.add(new Schema.Field(field.name(), fieldSchema, null, (Object) null));
|
||||
}
|
||||
}
|
||||
Schema schema = Schema.createRecord(recordName, null, null, false);
|
||||
schema.setFields(fields);
|
||||
return schema;
|
||||
}
|
||||
|
||||
private static Schema createAvroSchemaWithNamespace(TypeDescription orcSchema, String recordName, String namespace) {
|
||||
switch (orcSchema.getCategory()) {
|
||||
case BOOLEAN:
|
||||
return Schema.create(Schema.Type.BOOLEAN);
|
||||
case BYTE:
|
||||
// tinyint (8 bit), use int to hold it
|
||||
return Schema.create(Schema.Type.INT);
|
||||
case SHORT:
|
||||
// smallint (16 bit), use int to hold it
|
||||
return Schema.create(Schema.Type.INT);
|
||||
case INT:
|
||||
// the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS, but there is no way to distinguish
|
||||
return Schema.create(Schema.Type.INT);
|
||||
case LONG:
|
||||
// the Avro logical type could be AvroTypeUtil.LOGICAL_TYPE_TIME_MICROS, but there is no way to distinguish
|
||||
return Schema.create(Schema.Type.LONG);
|
||||
case FLOAT:
|
||||
return Schema.create(Schema.Type.FLOAT);
|
||||
case DOUBLE:
|
||||
return Schema.create(Schema.Type.DOUBLE);
|
||||
case VARCHAR:
|
||||
case CHAR:
|
||||
case STRING:
|
||||
return Schema.create(Schema.Type.STRING);
|
||||
case DATE:
|
||||
Schema date = Schema.create(Schema.Type.INT);
|
||||
LogicalTypes.date().addToSchema(date);
|
||||
return date;
|
||||
case TIMESTAMP:
|
||||
Schema timestamp = Schema.create(Schema.Type.LONG);
|
||||
LogicalTypes.timestampMillis().addToSchema(timestamp);
|
||||
return timestamp;
|
||||
case BINARY:
|
||||
return Schema.create(Schema.Type.BYTES);
|
||||
case DECIMAL:
|
||||
Schema decimal = Schema.create(Schema.Type.BYTES);
|
||||
LogicalTypes.decimal(orcSchema.getPrecision(), orcSchema.getScale()).addToSchema(decimal);
|
||||
return decimal;
|
||||
case LIST:
|
||||
return Schema.createArray(createAvroSchemaWithNamespace(orcSchema.getChildren().get(0), recordName, ""));
|
||||
case MAP:
|
||||
return Schema.createMap(createAvroSchemaWithNamespace(orcSchema.getChildren().get(1), recordName, ""));
|
||||
case STRUCT:
|
||||
List<Field> childFields = new ArrayList<>();
|
||||
for (int i = 0; i < orcSchema.getChildren().size(); i++) {
|
||||
TypeDescription childType = orcSchema.getChildren().get(i);
|
||||
String childName = orcSchema.getFieldNames().get(i);
|
||||
childFields.add(new Field(childName, createAvroSchemaWithNamespace(childType, childName, ""), null, null));
|
||||
}
|
||||
return Schema.createRecord(recordName, null, namespace, false, childFields);
|
||||
default:
|
||||
throw new IllegalStateException(String.format("Unrecognized ORC type: %s", orcSchema.getCategory().getName()));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,6 @@
|
||||
package org.apache.hudi.common.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@@ -50,8 +48,6 @@ import org.apache.orc.Reader.Options;
|
||||
import org.apache.orc.RecordReader;
|
||||
import org.apache.orc.TypeDescription;
|
||||
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_SCHEMA_METADATA_KEY;
|
||||
|
||||
/**
|
||||
* Utility functions for ORC files.
|
||||
*/
|
||||
@@ -226,9 +222,8 @@ public class OrcUtils extends BaseFileUtils {
|
||||
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
|
||||
try {
|
||||
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
|
||||
ByteBuffer schemaBuffer = reader.getMetadataValue(HOODIE_AVRO_SCHEMA_METADATA_KEY);
|
||||
String schemaText = StandardCharsets.UTF_8.decode(schemaBuffer).toString();
|
||||
return new Schema.Parser().parse(schemaText);
|
||||
TypeDescription orcSchema = reader.getSchema();
|
||||
return AvroOrcUtils.createAvroSchema(orcSchema);
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
|
||||
}
|
||||
|
||||
@@ -111,6 +111,13 @@ public class HoodieTestUtils {
|
||||
return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
|
||||
}
|
||||
|
||||
public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, HoodieFileFormat baseFileFormat) throws IOException {
|
||||
Properties props = new Properties();
|
||||
props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath);
|
||||
props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), baseFileFormat.name());
|
||||
return init(getDefaultHadoopConf(), basePath, tableType, props);
|
||||
}
|
||||
|
||||
public static <T extends Serializable> T serializeDeserialize(T object, Class<T> clazz) {
|
||||
// Using Kryo as the default serializer in Spark Jobs
|
||||
Kryo kryo = new Kryo();
|
||||
|
||||
Reference in New Issue
Block a user