[RFC-33] [HUDI-2429][Stacked on HUDI-2560] Support full Schema evolution for Spark (#4910)
* [HUDI-2560] Introduce an ID-based schema to support full schema evolution
* Add tests for FileBasedInternalSchemaStorageManager and rebase code
* Add support for changing column types and fix some test cases
* Fix some bugs encountered in the production environment and delete useless code
* Fix test errors and some nested schema change bugs
* [HUDI-2429][Stacked on HUDI-2560] Support full schema evolution for Spark
* Use a dummy InternalSchema instead of null
* Add support for Spark 3.1.x (temporarily removed due to compile failures, then restored)
* Fix the count(*) bug
* Get the InternalSchema by parsing the commit/history files directly instead of going through the meta client, which is time-consuming
* Rebase code and fix failing UTs
* Address review comments and fix new issues
* Control the fallback to the original write logic
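As a rough illustration, this is the kind of evolution the change enables from Spark SQL (a sketch only; `hudi_tbl` is a hypothetical table name and exact DDL support depends on the Spark version):

    // evolve the schema, then read old files back under the evolved schema
    spark.sql("ALTER TABLE hudi_tbl ADD COLUMNS (note string)")
    spark.sql("ALTER TABLE hudi_tbl ALTER COLUMN price TYPE double") // e.g. widen int/long/float
    spark.sql("SELECT price, note FROM hudi_tbl").show()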
@@ -0,0 +1,502 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.client.utils;

import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.ArrayType$;
import org.apache.spark.sql.types.BinaryType;
import org.apache.spark.sql.types.BinaryType$;
import org.apache.spark.sql.types.BooleanType;
import org.apache.spark.sql.types.BooleanType$;
import org.apache.spark.sql.types.ByteType;
import org.apache.spark.sql.types.CharType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.DateType$;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.DecimalType;
import org.apache.spark.sql.types.DecimalType$;
import org.apache.spark.sql.types.DoubleType;
import org.apache.spark.sql.types.DoubleType$;
import org.apache.spark.sql.types.FloatType;
import org.apache.spark.sql.types.FloatType$;
import org.apache.spark.sql.types.IntegerType;
import org.apache.spark.sql.types.IntegerType$;
import org.apache.spark.sql.types.LongType;
import org.apache.spark.sql.types.LongType$;
import org.apache.spark.sql.types.MapType;
import org.apache.spark.sql.types.MapType$;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.ShortType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StructType$;
import org.apache.spark.sql.types.TimestampType;
import org.apache.spark.sql.types.TimestampType$;
import org.apache.spark.sql.types.UserDefinedType;
import org.apache.spark.sql.types.VarcharType;

import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

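/**
 * Converts between Spark {@link StructType} schemas and Hudi's {@link InternalSchema},
 * and rewrites vectorized column data when a column's type has changed.
 */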
public class SparkInternalSchemaConverter {
  private SparkInternalSchemaConverter() {
  }

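  // Config keys used to pass the internal query schema, the table path and the list of
  // valid commits down to the file readers.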
  public static final String HOODIE_QUERY_SCHEMA = "hoodie.schema.internal.querySchema";
  public static final String HOODIE_TABLE_PATH = "hoodie.tablePath";
  public static final String HOODIE_VALID_COMMITS_LIST = "hoodie.valid.commits.list";

  /**
   * Converts a Spark schema to a Hudi internal schema. Fields without IDs are kept and assigned fallback IDs.
   *
   * @param sparkSchema a Spark schema
   * @return a matching internal schema for the provided Spark schema
   */
  public static InternalSchema convertStructTypeToInternalSchema(StructType sparkSchema) {
    Type newType = buildTypeFromStructType(sparkSchema, true, new AtomicInteger(0));
    return new InternalSchema(((Types.RecordType) newType).fields());
  }

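  /**
   * Recursively converts a Spark {@link DataType} to a Hudi internal {@link Type}, assigning
   * field IDs from {@code nextId}: the direct fields of a struct receive consecutive IDs
   * before their children are visited.
   */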
  public static Type buildTypeFromStructType(DataType sparkType, Boolean firstVisitRoot, AtomicInteger nextId) {
    if (sparkType instanceof StructType) {
      StructField[] fields = ((StructType) sparkType).fields();
      int nextAssignId = firstVisitRoot ? 0 : nextId.get();
      nextId.set(nextAssignId + fields.length);
      List<Type> newTypes = new ArrayList<>();
      for (StructField f : fields) {
        newTypes.add(buildTypeFromStructType(f.dataType(), false, nextId));
      }
      List<Types.Field> newFields = new ArrayList<>();
      for (int i = 0; i < newTypes.size(); i++) {
        StructField f = fields[i];
        newFields.add(Types.Field.get(nextAssignId + i, f.nullable(), f.name(), newTypes.get(i),
            f.getComment().isDefined() ? f.getComment().get() : null));
      }
      return Types.RecordType.get(newFields);
    } else if (sparkType instanceof MapType) {
      MapType map = (MapType) sparkType;
      DataType keyType = map.keyType();
      DataType valueType = map.valueType();
      int keyId = nextId.get();
      int valueId = keyId + 1;
      nextId.set(valueId + 1);
      return Types.MapType.get(keyId, valueId, buildTypeFromStructType(keyType, false, nextId),
          buildTypeFromStructType(valueType, false, nextId), map.valueContainsNull());
    } else if (sparkType instanceof ArrayType) {
      ArrayType array = (ArrayType) sparkType;
      DataType et = array.elementType();
      int elementId = nextId.get();
      nextId.set(elementId + 1);
      return Types.ArrayType.get(elementId, array.containsNull(), buildTypeFromStructType(et, false, nextId));
    } else if (sparkType instanceof UserDefinedType) {
      throw new UnsupportedOperationException("User-defined types are not supported");
    } else if (sparkType instanceof BooleanType) {
      return Types.BooleanType.get();
    } else if (sparkType instanceof IntegerType
        || sparkType instanceof ShortType
        || sparkType instanceof ByteType) {
      return Types.IntType.get();
    } else if (sparkType instanceof LongType) {
      return Types.LongType.get();
    } else if (sparkType instanceof FloatType) {
      return Types.FloatType.get();
    } else if (sparkType instanceof DoubleType) {
      return Types.DoubleType.get();
    } else if (sparkType instanceof StringType
        || sparkType instanceof CharType
        || sparkType instanceof VarcharType) {
      return Types.StringType.get();
    } else if (sparkType instanceof DateType) {
      return Types.DateType.get();
    // Spark 3.3.0 introduces TimestampNTZType; TODO: support it once Spark 3.3.0 is supported.
    } else if (sparkType instanceof TimestampType) {
      return Types.TimestampType.get();
    } else if (sparkType instanceof DecimalType) {
      return Types.DecimalType.get(
          ((DecimalType) sparkType).precision(),
          ((DecimalType) sparkType).scale());
    } else if (sparkType instanceof BinaryType) {
      return Types.BinaryType.get();
    } else {
      throw new UnsupportedOperationException(String.format("Not a supported type: %s", sparkType.catalogString()));
    }
  }

  /**
   * Converts a Spark schema to a Hudi internal schema and prunes its fields.
   * Fields without IDs are kept and assigned fallback IDs.
   *
   * @param sparkSchema a pruned Spark schema
   * @param originSchema an internal schema for the Hudi table
   * @return a pruned internal schema for the provided Spark schema
   */
  public static InternalSchema convertAndPruneStructTypeToInternalSchema(StructType sparkSchema, InternalSchema originSchema) {
    List<String> pruneNames = collectColNamesFromSparkStruct(sparkSchema);
    return InternalSchemaUtils.pruneInternalSchema(originSchema, pruneNames);
  }

  /**
   * Collects the full names of all leaf fields.
   *
   * @param sparkSchema a Spark schema
   * @return the full names of the leaf fields
   */
  public static List<String> collectColNamesFromSparkStruct(StructType sparkSchema) {
    List<String> result = new ArrayList<>();
    collectColNamesFromStructType(sparkSchema, new LinkedList<>(), result);
    return result;
  }

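  // Walks the schema depth-first, keeping the path of enclosing field names on the deque so
  // that each leaf can be recorded under its full dotted name; map entries are traversed as
  // "key"/"value" and array entries as "element".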
  private static void collectColNamesFromStructType(DataType sparkType, Deque<String> fieldNames, List<String> resultSet) {
    if (sparkType instanceof StructType) {
      StructField[] fields = ((StructType) sparkType).fields();
      for (StructField f : fields) {
        fieldNames.push(f.name());
        collectColNamesFromStructType(f.dataType(), fieldNames, resultSet);
        fieldNames.pop();
        addFullName(f.dataType(), f.name(), fieldNames, resultSet);
      }
    } else if (sparkType instanceof MapType) {
      MapType map = (MapType) sparkType;
      DataType keyType = map.keyType();
      DataType valueType = map.valueType();
      // key
      fieldNames.push("key");
      collectColNamesFromStructType(keyType, fieldNames, resultSet);
      fieldNames.pop();
      addFullName(keyType, "key", fieldNames, resultSet);
      // value
      fieldNames.push("value");
      collectColNamesFromStructType(valueType, fieldNames, resultSet);
      fieldNames.pop();
      addFullName(valueType, "value", fieldNames, resultSet);
    } else if (sparkType instanceof ArrayType) {
      ArrayType array = (ArrayType) sparkType;
      DataType et = array.elementType();
      fieldNames.push("element");
      collectColNamesFromStructType(et, fieldNames, resultSet);
      fieldNames.pop();
      addFullName(et, "element", fieldNames, resultSet);
    } else if (sparkType instanceof UserDefinedType) {
      throw new UnsupportedOperationException("User-defined types are not supported");
    } else {
      // do nothing: leaf types are recorded via addFullName by the caller
    }
  }

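  // Records the field's full name, but only for leaf types (not struct/array/map).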
private static void addFullName(DataType sparkType, String name, Deque<String> fieldNames, List<String> resultSet) {
|
||||
if (!(sparkType instanceof StructType) && !(sparkType instanceof ArrayType) && !(sparkType instanceof MapType)) {
|
||||
resultSet.add(InternalSchemaUtils.createFullName(name, fieldNames));
|
||||
}
|
||||
}
|
||||
|
||||
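  /**
   * Merges a file schema with the query schema and returns the merged schema as a Spark
   * StructType, so records written under the old schema can be read with the evolved one.
   */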
  public static StructType mergeSchema(InternalSchema fileSchema, InternalSchema querySchema) {
    InternalSchema schema = new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema();
    return constructSparkSchemaFromInternalSchema(schema);
  }

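  /**
   * Finds the columns whose type differs between the two internal schemas and returns,
   * for each, the pair of corresponding Spark types (before, after).
   */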
  public static Map<Integer, Pair<DataType, DataType>> collectTypeChangedCols(InternalSchema schema, InternalSchema other) {
    return InternalSchemaUtils
        .collectTypeChangedCols(schema, other)
        .entrySet()
        .stream()
        .collect(Collectors.toMap(Map.Entry::getKey,
            e -> Pair.of(constructSparkSchemaFromType(e.getValue().getLeft()), constructSparkSchemaFromType(e.getValue().getRight()))));
  }

  public static StructType constructSparkSchemaFromInternalSchema(InternalSchema schema) {
    return (StructType) constructSparkSchemaFromType(schema.getRecord());
  }

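  // Maps a Hudi internal type to its Spark DataType; UUID is surfaced as string and FIXED as binary.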
  private static DataType constructSparkSchemaFromType(Type type) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Types.Field> fields = record.fields();
        List<StructField> structFields = new ArrayList<>();
        for (Types.Field f : fields) {
          DataType dataType = constructSparkSchemaFromType(f.type());
          StructField structField = StructField.apply(f.name(), dataType, f.isOptional(), Metadata.empty());
          structField = f.doc() == null ? structField : structField.withComment(f.doc());
          structFields.add(structField);
        }
        return StructType$.MODULE$.apply(structFields);
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        DataType elementType = constructSparkSchemaFromType(array.elementType());
        return ArrayType$.MODULE$.apply(elementType, array.isElementOptional());
      case MAP:
        Types.MapType map = (Types.MapType) type;
        DataType keyDataType = constructSparkSchemaFromType(map.keyType());
        DataType valueDataType = constructSparkSchemaFromType(map.valueType());
        return MapType$.MODULE$.apply(keyDataType, valueDataType, map.isValueOptional());
      case BOOLEAN:
        return BooleanType$.MODULE$;
      case INT:
        return IntegerType$.MODULE$;
      case LONG:
        return LongType$.MODULE$;
      case FLOAT:
        return FloatType$.MODULE$;
      case DOUBLE:
        return DoubleType$.MODULE$;
      case DATE:
        return DateType$.MODULE$;
      case TIME:
        throw new UnsupportedOperationException(String.format("cannot convert %s type to Spark", type));
      case TIMESTAMP:
        // TODO: support TimestampNTZ
        return TimestampType$.MODULE$;
      case STRING:
        return StringType$.MODULE$;
      case UUID:
        return StringType$.MODULE$;
      case FIXED:
        return BinaryType$.MODULE$;
      case BINARY:
        return BinaryType$.MODULE$;
      case DECIMAL:
        Types.DecimalType decimal = (Types.DecimalType) type;
        return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale());
      default:
        throw new UnsupportedOperationException(String.format("cannot convert unknown type: %s to Spark", type));
    }
  }

  /**
   * Converts an int/long column vector to another type.
   * Currently only int/long -> long/float/double/string/decimal is supported.
   * TODO: support more types
   */
  private static boolean convertIntLongType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    boolean isInt = oldV.dataType() instanceof IntegerType;
    if (newType instanceof LongType || newType instanceof FloatType
        || newType instanceof DoubleType || newType instanceof StringType || newType instanceof DecimalType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        // int/long -> long/float/double/string/decimal
        if (newType instanceof LongType) {
          newV.putLong(i, isInt ? oldV.getInt(i) : oldV.getLong(i));
        } else if (newType instanceof FloatType) {
          newV.putFloat(i, isInt ? oldV.getInt(i) : oldV.getLong(i));
        } else if (newType instanceof DoubleType) {
          newV.putDouble(i, isInt ? oldV.getInt(i) : oldV.getLong(i));
        } else if (newType instanceof StringType) {
          newV.putByteArray(i, ((isInt ? oldV.getInt(i) : oldV.getLong(i)) + "").getBytes(StandardCharsets.UTF_8));
        } else if (newType instanceof DecimalType) {
          Decimal oldDecimal = Decimal.apply(isInt ? oldV.getInt(i) : oldV.getLong(i));
          oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale());
          newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision());
        }
      }
      return true;
    }
    return false;
  }

  /**
   * Converts a float column vector to another type.
   * Currently only float -> double/string/decimal is supported.
   * TODO: support more types
   */
  private static boolean convertFloatType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    if (newType instanceof DoubleType || newType instanceof StringType || newType instanceof DecimalType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        // float -> double/string/decimal
        if (newType instanceof DoubleType) {
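          // Round-trip through the string form so the double keeps the float's printed value
          // (a direct cast of 1.1f would yield 1.100000023841858).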
          newV.putDouble(i, Double.valueOf(oldV.getFloat(i) + ""));
        } else if (newType instanceof StringType) {
          newV.putByteArray(i, (oldV.getFloat(i) + "").getBytes(StandardCharsets.UTF_8));
        } else if (newType instanceof DecimalType) {
          Decimal oldDecimal = Decimal.apply(oldV.getFloat(i));
          oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale());
          newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision());
        }
      }
      return true;
    }
    return false;
  }

  /**
   * Converts a double column vector to another type.
   * Currently only double -> decimal/string is supported.
   * TODO: support more types
   */
  private static boolean convertDoubleType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    if (newType instanceof DecimalType || newType instanceof StringType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        // double -> decimal/string
        if (newType instanceof DecimalType) {
          Decimal oldDecimal = Decimal.apply(oldV.getDouble(i));
          oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale());
          newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision());
        } else if (newType instanceof StringType) {
          newV.putByteArray(i, (oldV.getDouble(i) + "").getBytes(StandardCharsets.UTF_8));
        }
      }
      return true;
    }
    return false;
  }

  /**
   * Converts a decimal column vector to another type.
   * Currently only decimal -> decimal/string is supported.
   * TODO: support more types
   */
  private static boolean convertDecimalType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    DataType oldType = oldV.dataType();
    if (newType instanceof DecimalType || newType instanceof StringType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        Decimal oldDecimal = oldV.getDecimal(i, ((DecimalType) oldType).precision(), ((DecimalType) oldType).scale());
        if (newType instanceof DecimalType) {
          oldDecimal.changePrecision(((DecimalType) newType).precision(), ((DecimalType) newType).scale());
          newV.putDecimal(i, oldDecimal, ((DecimalType) newType).precision());
        } else if (newType instanceof StringType) {
          newV.putByteArray(i, oldDecimal.toString().getBytes(StandardCharsets.UTF_8));
        }
      }
      return true;
    }
    return false;
  }

  /**
   * Converts a date column vector to another type.
   * Currently only date -> string is supported.
   * TODO: support more types
   */
  private static boolean convertDateType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    if (newType instanceof StringType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        // TODO: support rebaseDate
        String res = org.apache.spark.sql.catalyst.util.DateTimeUtils.toJavaDate(oldV.getInt(i)).toString();
        newV.putByteArray(i, res.getBytes(StandardCharsets.UTF_8));
      }
      return true;
    }
    return false;
  }

  /**
   * Converts a string column vector to another type.
   * Currently only string -> decimal/date is supported.
   * Note: this conversion may fail at runtime, e.g. for strings that do not parse as a date or number.
   * TODO: support more types
   */
  private static boolean convertStringType(WritableColumnVector oldV, WritableColumnVector newV, DataType newType, int len) {
    if (newType instanceof DateType || newType instanceof DecimalType) {
      for (int i = 0; i < len; i++) {
        if (oldV.isNullAt(i)) {
          newV.putNull(i);
          continue;
        }
        // TODO: support rebaseDate
        if (newType instanceof DateType) {
          int days = org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaDate(Date.valueOf(oldV.getUTF8String(i).toString()));
          newV.putInt(i, days);
        } else if (newType instanceof DecimalType) {
          DecimalType decimalType = (DecimalType) newType;
          java.math.BigDecimal bigDecimal = new java.math.BigDecimal(oldV.getUTF8String(i).toString().trim());
          Decimal sparkDecimal = Decimal.apply(bigDecimal);
          sparkDecimal.changePrecision(decimalType.precision(), decimalType.scale());
          newV.putDecimal(i, sparkDecimal, decimalType.precision());
        }
      }
      return true;
    }
    return false;
  }

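  /**
   * Converts {@code len} values of {@code oldV} to {@code newV}'s data type, writing the results
   * into {@code newV}. Returns true if a supported conversion was applied, false otherwise.
   */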
  public static boolean convertColumnVectorType(WritableColumnVector oldV, WritableColumnVector newV, int len) {
    if (len == 0 || oldV == null || newV == null) {
      return false;
    }
    DataType oldType = oldV.dataType(); // old column type, e.g. FloatType
    DataType newType = newV.dataType(); // new column type, e.g. DoubleType
    if (oldType instanceof BooleanType) {
      return false;
    } else if (oldType instanceof ByteType) {
      return false;
    } else if (oldType instanceof ShortType) {
      return false;
    } else if (oldType instanceof IntegerType) {
      return convertIntLongType(oldV, newV, newType, len);
    } else if (oldType instanceof LongType) {
      return convertIntLongType(oldV, newV, newType, len);
    } else if (oldType instanceof FloatType) {
      return convertFloatType(oldV, newV, newType, len);
    } else if (oldType instanceof DoubleType) {
      return convertDoubleType(oldV, newV, newType, len);
    } else if (oldType instanceof StringType) {
      return convertStringType(oldV, newV, newType, len);
    } else if (oldType instanceof BinaryType) {
      return false;
    } else if (oldType instanceof DecimalType) {
      return convertDecimalType(oldV, newV, newType, len);
    } else if (oldType instanceof DateType) {
      return convertDateType(oldV, newV, newType, len);
    } else if (oldType instanceof TimestampType) {
      return false;
    } else {
      throw new UnsupportedOperationException("Datatype not supported: " + oldType);
    }
  }
}

@@ -37,8 +37,12 @@ import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFile
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

import java.util.Properties

import org.apache.hudi.internal.schema.InternalSchema
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils

import scala.collection.JavaConverters._

object HoodieSparkUtils extends SparkAdapterSupport {

@@ -55,6 +59,10 @@ object HoodieSparkUtils extends SparkAdapterSupport {

  def gteqSpark3_2: Boolean = SPARK_VERSION > "3.2"

  def gteqSpark3_1: Boolean = SPARK_VERSION > "3.1"

  def gteqSpark3_1_3: Boolean = SPARK_VERSION >= "3.1.3"

  def getMetaSchema: StructType = {
    StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
      StructField(col, StringType, nullable = true)
@@ -302,17 +310,25 @@ object HoodieSparkUtils extends SparkAdapterSupport {
      AttributeReference(columnName, field.get.dataType, field.get.nullable)()
    }

-  def getRequiredSchema(tableAvroSchema: Schema, requiredColumns: Array[String]): (Schema, StructType) = {
-    // First get the required avro-schema, then convert the avro-schema to spark schema.
-    val name2Fields = tableAvroSchema.getFields.asScala.map(f => f.name() -> f).toMap
-    // Here have to create a new Schema.Field object
-    // to prevent throwing exceptions like "org.apache.avro.AvroRuntimeException: Field already used".
-    val requiredFields = requiredColumns.map(c => name2Fields(c))
-      .map(f => new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order())).toList
-    val requiredAvroSchema = Schema.createRecord(tableAvroSchema.getName, tableAvroSchema.getDoc,
-      tableAvroSchema.getNamespace, tableAvroSchema.isError, requiredFields.asJava)
-    val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
-    (requiredAvroSchema, requiredStructSchema)
+  def getRequiredSchema(tableAvroSchema: Schema, requiredColumns: Array[String], internalSchema: InternalSchema = InternalSchema.getEmptyInternalSchema): (Schema, StructType, InternalSchema) = {
+    if (internalSchema.isEmptySchema || requiredColumns.isEmpty) {
+      // First get the required avro-schema, then convert the avro-schema to spark schema.
+      val name2Fields = tableAvroSchema.getFields.asScala.map(f => f.name() -> f).toMap
+      // Here we have to create a new Schema.Field object
+      // to prevent throwing exceptions like "org.apache.avro.AvroRuntimeException: Field already used".
+      val requiredFields = requiredColumns.map(c => name2Fields(c))
+        .map(f => new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order())).toList
+      val requiredAvroSchema = Schema.createRecord(tableAvroSchema.getName, tableAvroSchema.getDoc,
+        tableAvroSchema.getNamespace, tableAvroSchema.isError, requiredFields.asJava)
+      val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
+      (requiredAvroSchema, requiredStructSchema, internalSchema)
+    } else {
+      // nested projection is supported now
+      val prunedInternalSchema = InternalSchemaUtils.pruneInternalSchema(internalSchema, requiredColumns.toList.asJava)
+      val requiredAvroSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, tableAvroSchema.getName)
+      val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
+      (requiredAvroSchema, requiredStructSchema, prunedInternalSchema)
+    }
  }

  def toAttribute(tableSchema: StructType): Seq[AttributeReference] = {

@@ -33,6 +33,9 @@ import org.apache.spark.sql.execution.datasources.{FilePartition, LogicalRelatio
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, Row, SparkSession}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat

import java.util.Locale

@@ -167,4 +170,14 @@ trait SparkAdapter extends Serializable {
      other
    }
  }

  /**
   * Create a custom resolution rule to deal with the alter table commands for Hudi.
   */
  def createResolveHudiAlterTableCommand(sparkSession: SparkSession): Rule[LogicalPlan]

  /**
   * Create hoodie parquet file format.
   */
  def createHoodieParquetFileFormat(): Option[ParquetFileFormat]
}