[RFC-33] [HUDI-2429][Stacked on HUDI-2560] Support full Schema evolution for Spark (#4910)
* [HUDI-2560] introduce id_based schema to support full schema evolution. * add test for FileBasedInternalSchemaStorageManager and rebase code * add support for change column type and fix some test case * fix some bugs encountered in the production env and delete useless code * fix test error * rebase code * fixed some nested schema change bugs * [HUDI-2429][Stacked On HUDI-2560]Support full schema evolution for spark * [use dummyInternalSchema instead of null] * add support for spark3.1.x * remove support for spark3.1.x, since some compile fail * support spark3.1.x * rebase and prepare solve all comments * address all comments * rebase code * fixed the count(*) bug * try to get internalSchema by parsing commit file/history file directly, not use metaclient which is time cost; address some comments * fixed all comments * fix new comments * rebase code, fix UT failures * fixed mistake * rebase code, fixed new comments * rebase code, and prepare for address new comments * address commits * address new comments * fix new issues * control fallback original write logical
This commit is contained in:
@@ -19,6 +19,8 @@
|
||||
package org.apache.hudi.avro;
|
||||
|
||||
import org.apache.avro.AvroRuntimeException;
|
||||
import org.apache.avro.SchemaCompatibility;
|
||||
import org.apache.avro.Conversions;
|
||||
import org.apache.avro.Conversions.DecimalConversion;
|
||||
import org.apache.avro.JsonProperties;
|
||||
import org.apache.avro.LogicalTypes;
|
||||
@@ -55,16 +57,26 @@ import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.Date;
|
||||
import java.sql.Timestamp;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.TimeZone;
|
||||
import java.util.Iterator;
|
||||
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.avro.Schema.Type.UNION;
|
||||
|
||||
/**
|
||||
* Helper class to do common stuff across Avro.
|
||||
*/
|
||||
@@ -74,6 +86,11 @@ public class HoodieAvroUtils {
|
||||
|
||||
private static ThreadLocal<BinaryDecoder> reuseDecoder = ThreadLocal.withInitial(() -> null);
|
||||
|
||||
private static final long MILLIS_PER_DAY = 86400000L;
|
||||
|
||||
//Export for test
|
||||
public static final Conversions.DecimalConversion DECIMAL_CONVERSION = new Conversions.DecimalConversion();
|
||||
|
||||
// As per https://avro.apache.org/docs/current/spec.html#names
|
||||
private static String INVALID_AVRO_CHARS_IN_NAMES = "[^A-Za-z0-9_]";
|
||||
private static String INVALID_AVRO_FIRST_CHAR_IN_NAMES = "[^A-Za-z_]";
|
||||
@@ -655,4 +672,271 @@ public class HoodieAvroUtils {
|
||||
|
||||
return nonNullType;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema.
|
||||
* support deep rewrite for nested record.
|
||||
* This particular method does the following things :
|
||||
* a) Create a new empty GenericRecord with the new schema.
|
||||
* b) For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this transformed schema
|
||||
*
|
||||
* @param oldRecord oldRecord to be rewritten
|
||||
* @param newSchema newSchema used to rewrite oldRecord
|
||||
* @return newRecord for new Schema
|
||||
*/
|
||||
public static GenericRecord rewriteRecordWithNewSchema(IndexedRecord oldRecord, Schema newSchema) {
|
||||
Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema);
|
||||
return (GenericData.Record) newRecord;
|
||||
}
|
||||
|
||||
private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSchema, Schema newSchema) {
|
||||
if (oldRecord == null) {
|
||||
return null;
|
||||
}
|
||||
switch (newSchema.getType()) {
|
||||
case RECORD:
|
||||
if (!(oldRecord instanceof IndexedRecord)) {
|
||||
throw new IllegalArgumentException("cannot rewrite record with different type");
|
||||
}
|
||||
IndexedRecord indexedRecord = (IndexedRecord) oldRecord;
|
||||
List<Schema.Field> fields = newSchema.getFields();
|
||||
Map<Integer, Object> helper = new HashMap<>();
|
||||
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
Schema.Field field = fields.get(i);
|
||||
if (oldSchema.getField(field.name()) != null) {
|
||||
Schema.Field oldField = oldSchema.getField(field.name());
|
||||
helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema()));
|
||||
}
|
||||
}
|
||||
GenericData.Record newRecord = new GenericData.Record(newSchema);
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
if (helper.containsKey(i)) {
|
||||
newRecord.put(i, helper.get(i));
|
||||
} else {
|
||||
if (fields.get(i).defaultVal() instanceof JsonProperties.Null) {
|
||||
newRecord.put(i, null);
|
||||
} else {
|
||||
newRecord.put(i, fields.get(i).defaultVal());
|
||||
}
|
||||
}
|
||||
}
|
||||
return newRecord;
|
||||
case ARRAY:
|
||||
if (!(oldRecord instanceof Collection)) {
|
||||
throw new IllegalArgumentException("cannot rewrite record with different type");
|
||||
}
|
||||
Collection array = (Collection)oldRecord;
|
||||
List<Object> newArray = new ArrayList();
|
||||
for (Object element : array) {
|
||||
newArray.add(rewriteRecordWithNewSchema(element, oldSchema.getElementType(), newSchema.getElementType()));
|
||||
}
|
||||
return newArray;
|
||||
case MAP:
|
||||
if (!(oldRecord instanceof Map)) {
|
||||
throw new IllegalArgumentException("cannot rewrite record with different type");
|
||||
}
|
||||
Map<Object, Object> map = (Map<Object, Object>) oldRecord;
|
||||
Map<Object, Object> newMap = new HashMap<>();
|
||||
for (Map.Entry<Object, Object> entry : map.entrySet()) {
|
||||
newMap.put(entry.getKey(), rewriteRecordWithNewSchema(entry.getValue(), oldSchema.getValueType(), newSchema.getValueType()));
|
||||
}
|
||||
return newMap;
|
||||
case UNION:
|
||||
return rewriteRecordWithNewSchema(oldRecord, getActualSchemaFromUnion(oldSchema, oldRecord), getActualSchemaFromUnion(newSchema, oldRecord));
|
||||
default:
|
||||
return rewritePrimaryType(oldRecord, oldSchema, newSchema);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Rewrites a primitive (non-record/array/map) value from {@code oldSchema} into {@code newSchema}.
 * Same-type values pass through unchanged, except FIXED, where a decimal precision/scale change
 * is re-encoded; differing types are delegated to {@link #rewritePrimaryTypeWithDiffSchemaType}.
 *
 * @param oldValue value to rewrite
 * @param oldSchema schema the value currently conforms to (unions are resolved first)
 * @param newSchema target schema
 * @return the rewritten value
 * @throws AvroRuntimeException for same-type schemas that are not primitive/FIXED
 */
private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Schema newSchema) {
  Schema realOldSchema = oldSchema;
  // resolve a union down to the concrete member the datum conforms to
  if (realOldSchema.getType() == UNION) {
    realOldSchema = getActualSchemaFromUnion(oldSchema, oldValue);
  }
  if (realOldSchema.getType() == newSchema.getType()) {
    switch (realOldSchema.getType()) {
      case NULL:
      case BOOLEAN:
      case INT:
      case LONG:
      case FLOAT:
      case DOUBLE:
      case BYTES:
      case STRING:
        // identical primitive type: no conversion needed
        return oldValue;
      case FIXED:
        // fixed size and name must match:
        if (!SchemaCompatibility.schemaNameEquals(realOldSchema, newSchema) || realOldSchema.getFixedSize() != newSchema.getFixedSize()) {
          // deal with the precision change for decimalType
          if (realOldSchema.getLogicalType() instanceof LogicalTypes.Decimal) {
            final byte[] bytes;
            bytes = ((GenericFixed) oldValue).bytes();
            LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) realOldSchema.getLogicalType();
            // NOTE(review): setScale without a RoundingMode throws ArithmeticException when the
            // value cannot be represented at the new scale — confirm inputs always widen scale.
            BigDecimal bd = new BigDecimal(new BigInteger(bytes), decimal.getScale()).setScale(((LogicalTypes.Decimal) newSchema.getLogicalType()).getScale());
            return DECIMAL_CONVERSION.toFixed(bd, newSchema, newSchema.getLogicalType());
          }
        } else {
          return oldValue;
        }
        // NOTE(review): reaching here means fixed name/size differ and it is not a decimal; the
        // old value is returned unchanged even though it may not fit newSchema — confirm intended.
        return oldValue;
      default:
        throw new AvroRuntimeException("Unknown schema type: " + newSchema.getType());
    }
  } else {
    return rewritePrimaryTypeWithDiffSchemaType(oldValue, realOldSchema, newSchema);
  }
}
|
||||
|
||||
private static Object rewritePrimaryTypeWithDiffSchemaType(Object oldValue, Schema oldSchema, Schema newSchema) {
|
||||
switch (newSchema.getType()) {
|
||||
case NULL:
|
||||
case BOOLEAN:
|
||||
break;
|
||||
case INT:
|
||||
if (newSchema.getLogicalType() == LogicalTypes.date() && oldSchema.getType() == Schema.Type.STRING) {
|
||||
return fromJavaDate(java.sql.Date.valueOf(oldValue.toString()));
|
||||
}
|
||||
break;
|
||||
case LONG:
|
||||
if (oldSchema.getType() == Schema.Type.INT) {
|
||||
return ((Integer) oldValue).longValue();
|
||||
}
|
||||
break;
|
||||
case FLOAT:
|
||||
if ((oldSchema.getType() == Schema.Type.INT)
|
||||
|| (oldSchema.getType() == Schema.Type.LONG)) {
|
||||
return oldSchema.getType() == Schema.Type.INT ? ((Integer) oldValue).floatValue() : ((Long) oldValue).floatValue();
|
||||
}
|
||||
break;
|
||||
case DOUBLE:
|
||||
if (oldSchema.getType() == Schema.Type.FLOAT) {
|
||||
// java float cannot convert to double directly, deal with float precision change
|
||||
return Double.valueOf(oldValue + "");
|
||||
} else if (oldSchema.getType() == Schema.Type.INT) {
|
||||
return ((Integer) oldValue).doubleValue();
|
||||
} else if (oldSchema.getType() == Schema.Type.LONG) {
|
||||
return ((Long) oldValue).doubleValue();
|
||||
}
|
||||
break;
|
||||
case BYTES:
|
||||
if (oldSchema.getType() == Schema.Type.STRING) {
|
||||
return (oldValue.toString()).getBytes(StandardCharsets.UTF_8);
|
||||
}
|
||||
break;
|
||||
case STRING:
|
||||
if (oldSchema.getType() == Schema.Type.BYTES) {
|
||||
return String.valueOf(((byte[]) oldValue));
|
||||
}
|
||||
if (oldSchema.getLogicalType() == LogicalTypes.date()) {
|
||||
return toJavaDate((Integer) oldValue).toString();
|
||||
}
|
||||
if (oldSchema.getType() == Schema.Type.INT
|
||||
|| oldSchema.getType() == Schema.Type.LONG
|
||||
|| oldSchema.getType() == Schema.Type.FLOAT
|
||||
|| oldSchema.getType() == Schema.Type.DOUBLE) {
|
||||
return oldValue.toString();
|
||||
}
|
||||
if (oldSchema.getType() == Schema.Type.FIXED && oldSchema.getLogicalType() instanceof LogicalTypes.Decimal) {
|
||||
final byte[] bytes;
|
||||
bytes = ((GenericFixed) oldValue).bytes();
|
||||
LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) oldSchema.getLogicalType();
|
||||
BigDecimal bd = new BigDecimal(new BigInteger(bytes), decimal.getScale());
|
||||
return bd.toString();
|
||||
}
|
||||
break;
|
||||
case FIXED:
|
||||
// deal with decimal Type
|
||||
if (newSchema.getLogicalType() instanceof LogicalTypes.Decimal) {
|
||||
// TODO: support more types
|
||||
if (oldSchema.getType() == Schema.Type.STRING
|
||||
|| oldSchema.getType() == Schema.Type.DOUBLE
|
||||
|| oldSchema.getType() == Schema.Type.INT
|
||||
|| oldSchema.getType() == Schema.Type.LONG
|
||||
|| oldSchema.getType() == Schema.Type.FLOAT) {
|
||||
LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) newSchema.getLogicalType();
|
||||
BigDecimal bigDecimal = null;
|
||||
if (oldSchema.getType() == Schema.Type.STRING) {
|
||||
bigDecimal = new java.math.BigDecimal(oldValue.toString())
|
||||
.setScale(decimal.getScale());
|
||||
} else {
|
||||
// Due to Java, there will be precision problems in direct conversion, we should use string instead of use double
|
||||
bigDecimal = new java.math.BigDecimal(oldValue.toString())
|
||||
.setScale(decimal.getScale());
|
||||
}
|
||||
return DECIMAL_CONVERSION.toFixed(bigDecimal, newSchema, newSchema.getLogicalType());
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
}
|
||||
throw new AvroRuntimeException(String.format("cannot support rewrite value for schema type: %s since the old schema type is: %s", newSchema, oldSchema));
|
||||
}
|
||||
|
||||
// convert days to Date
|
||||
private static java.sql.Date toJavaDate(int days) {
|
||||
long localMillis = Math.multiplyExact(days, MILLIS_PER_DAY);
|
||||
int timeZoneOffset;
|
||||
TimeZone defaultTimeZone = TimeZone.getDefault();
|
||||
if (defaultTimeZone instanceof sun.util.calendar.ZoneInfo) {
|
||||
timeZoneOffset = ((sun.util.calendar.ZoneInfo) defaultTimeZone).getOffsetsByWall(localMillis, null);
|
||||
} else {
|
||||
timeZoneOffset = defaultTimeZone.getOffset(localMillis - defaultTimeZone.getRawOffset());
|
||||
}
|
||||
return new java.sql.Date(localMillis - timeZoneOffset);
|
||||
}
|
||||
|
||||
// convert Date to days
|
||||
private static int fromJavaDate(Date date) {
|
||||
long millisUtc = date.getTime();
|
||||
long millisLocal = millisUtc + TimeZone.getDefault().getOffset(millisUtc);
|
||||
int julianDays = Math.toIntExact(Math.floorDiv(millisLocal, MILLIS_PER_DAY));
|
||||
return julianDays;
|
||||
}
|
||||
|
||||
private static Schema getActualSchemaFromUnion(Schema schema, Object data) {
|
||||
Schema actualSchema;
|
||||
if (!schema.getType().equals(UNION)) {
|
||||
return schema;
|
||||
}
|
||||
if (schema.getTypes().size() == 2
|
||||
&& schema.getTypes().get(0).getType() == Schema.Type.NULL) {
|
||||
actualSchema = schema.getTypes().get(1);
|
||||
} else if (schema.getTypes().size() == 2
|
||||
&& schema.getTypes().get(1).getType() == Schema.Type.NULL) {
|
||||
actualSchema = schema.getTypes().get(0);
|
||||
} else if (schema.getTypes().size() == 1) {
|
||||
actualSchema = schema.getTypes().get(0);
|
||||
} else {
|
||||
// deal complex union. this should not happened in hoodie,
|
||||
// since flink/spark do not write this type.
|
||||
int i = GenericData.get().resolveUnion(schema, data);
|
||||
actualSchema = schema.getTypes().get(i);
|
||||
}
|
||||
return actualSchema;
|
||||
}
|
||||
|
||||
/**
 * Given avro records, lazily rewrites them with the new schema.
 *
 * @param oldRecords oldRecords to be rewritten
 * @param newSchema newSchema used to rewrite each oldRecord
 * @return an iterator of rewritten GenericRecords; empty if either argument is null
 */
public static Iterator<GenericRecord> rewriteRecordWithNewSchema(Iterator<GenericRecord> oldRecords, Schema newSchema) {
  if (oldRecords == null || newSchema == null) {
    return Collections.emptyIterator();
  }
  // rewrite lazily: each record is converted on the corresponding next() call
  return new Iterator<GenericRecord>() {
    @Override
    public boolean hasNext() {
      return oldRecords.hasNext();
    }

    @Override
    public GenericRecord next() {
      return rewriteRecordWithNewSchema(oldRecords.next(), newSchema);
    }
  };
}
|
||||
}
|
||||
|
||||
@@ -50,6 +50,10 @@ public enum WriteOperationType {
|
||||
COMPACT("compact"),
|
||||
|
||||
INDEX("index"),
|
||||
|
||||
// alter schema
|
||||
ALTER_SCHEMA("alter_schema"),
|
||||
|
||||
// used for old version
|
||||
UNKNOWN("unknown");
|
||||
|
||||
@@ -90,6 +94,8 @@ public enum WriteOperationType {
|
||||
return COMPACT;
|
||||
case "index":
|
||||
return INDEX;
|
||||
case "alter_schema":
|
||||
return ALTER_SCHEMA;
|
||||
case "unknown":
|
||||
return UNKNOWN;
|
||||
default:
|
||||
|
||||
@@ -90,6 +90,8 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR
|
||||
+ ".fileids";
|
||||
|
||||
public static final String SCHEMA_FOLDER_NAME = ".schema";
|
||||
|
||||
public static final String MARKER_EXTN = ".marker";
|
||||
|
||||
private String basePath;
|
||||
@@ -192,6 +194,13 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
return new Path(metaPath, COLUMN_STATISTICS_INDEX_NAME).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return schema folder path
|
||||
*/
|
||||
public String getSchemaFolderName() {
|
||||
return new Path(metaPath, SCHEMA_FOLDER_NAME).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Temp Folder path
|
||||
*/
|
||||
@@ -392,6 +401,11 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
if (!fs.exists(metaPathDir)) {
|
||||
fs.mkdirs(metaPathDir);
|
||||
}
|
||||
// create schema folder
|
||||
Path schemaPathDir = new Path(metaPathDir, SCHEMA_FOLDER_NAME);
|
||||
if (!fs.exists(schemaPathDir)) {
|
||||
fs.mkdirs(schemaPathDir);
|
||||
}
|
||||
|
||||
// if anything other than default archive log folder is specified, create that too
|
||||
String archiveLogPropVal = new HoodieConfig(props).getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER);
|
||||
|
||||
@@ -18,13 +18,6 @@
|
||||
|
||||
package org.apache.hudi.common.table;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.avro.SchemaCompatibility;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
@@ -44,8 +37,18 @@ import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.InvalidTableException;
|
||||
import org.apache.hudi.io.storage.HoodieHFileReader;
|
||||
|
||||
import org.apache.hudi.io.storage.HoodieOrcReader;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
|
||||
import org.apache.hudi.internal.schema.utils.SerDeHelper;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.avro.SchemaCompatibility;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
@@ -534,4 +537,51 @@ public class TableSchemaResolver {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
|
||||
*
|
||||
* @return InternalSchema for this table
|
||||
*/
|
||||
public Option<InternalSchema> getTableInternalSchemaFromCommitMetadata() {
|
||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
if (timeline.lastInstant().isPresent()) {
|
||||
return getTableInternalSchemaFromCommitMetadata(timeline.lastInstant().get());
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
|
||||
*
|
||||
* @return InternalSchema for this table
|
||||
*/
|
||||
private Option<InternalSchema> getTableInternalSchemaFromCommitMetadata(HoodieInstant instant) {
|
||||
try {
|
||||
HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedInstants();
|
||||
byte[] data = timeline.getInstantDetails(instant).get();
|
||||
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
|
||||
String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
|
||||
if (latestInternalSchemaStr != null) {
|
||||
return SerDeHelper.fromJson(latestInternalSchemaStr);
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Failed to read schema from commit metadata", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the history schemas as String for a hoodie table from the HoodieCommitMetadata of the instant.
|
||||
*
|
||||
* @return history schemas string for this table
|
||||
*/
|
||||
public Option<String> getTableHistorySchemaStrFromCommitMetadata() {
|
||||
// now we only support FileBaseInternalSchemaManager
|
||||
FileBasedInternalSchemaStorageManager manager = new FileBasedInternalSchemaStorageManager(metaClient);
|
||||
String result = manager.getHistorySchemaStr();
|
||||
return result.isEmpty() ? Option.empty() : Option.of(result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi.common.table.log;
|
||||
|
||||
import org.apache.hudi.common.model.DeleteRecord;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
@@ -36,6 +37,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.ClosableIterator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.SpillableMapUtils;
|
||||
import org.apache.hudi.common.util.InternalSchemaCache;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
@@ -46,6 +48,9 @@ import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
|
||||
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -109,6 +114,10 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
private final FileSystem fs;
|
||||
// Total log files read - for metrics
|
||||
private AtomicLong totalLogFiles = new AtomicLong(0);
|
||||
// Internal schema, used to support full schema evolution.
|
||||
private InternalSchema internalSchema;
|
||||
// Hoodie table path.
|
||||
private final String path;
|
||||
// Total log blocks read - for metrics
|
||||
private AtomicLong totalLogBlocks = new AtomicLong(0);
|
||||
// Total log records read - for metrics
|
||||
@@ -135,14 +144,14 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
int bufferSize, Option<InstantRange> instantRange,
|
||||
boolean withOperationField) {
|
||||
this(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize,
|
||||
instantRange, withOperationField, true, Option.empty());
|
||||
instantRange, withOperationField, true, Option.empty(), InternalSchema.getEmptyInternalSchema());
|
||||
}
|
||||
|
||||
protected AbstractHoodieLogRecordReader(FileSystem fs, String basePath, List<String> logFilePaths,
|
||||
Schema readerSchema, String latestInstantTime, boolean readBlocksLazily,
|
||||
boolean reverseReader, int bufferSize, Option<InstantRange> instantRange,
|
||||
boolean withOperationField, boolean enableFullScan,
|
||||
Option<String> partitionName) {
|
||||
Option<String> partitionName, InternalSchema internalSchema) {
|
||||
this.readerSchema = readerSchema;
|
||||
this.latestInstantTime = latestInstantTime;
|
||||
this.hoodieTableMetaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).build();
|
||||
@@ -159,6 +168,8 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
this.instantRange = instantRange;
|
||||
this.withOperationField = withOperationField;
|
||||
this.enableFullScan = enableFullScan;
|
||||
this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
|
||||
this.path = basePath;
|
||||
|
||||
// Key fields when populate meta fields is disabled (that is, virtual keys enabled)
|
||||
if (!tableConfig.populateMetaFields()) {
|
||||
@@ -202,7 +213,7 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
// Iterate over the paths
|
||||
logFormatReaderWrapper = new HoodieLogFormatReader(fs,
|
||||
logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()),
|
||||
readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField);
|
||||
readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField, internalSchema);
|
||||
Set<HoodieLogFile> scannedLogFiles = new HashSet<>();
|
||||
while (logFormatReaderWrapper.hasNext()) {
|
||||
HoodieLogFile logFile = logFormatReaderWrapper.getLogFile();
|
||||
@@ -361,8 +372,10 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
*/
|
||||
private void processDataBlock(HoodieDataBlock dataBlock, Option<List<String>> keys) throws Exception {
|
||||
try (ClosableIterator<IndexedRecord> recordItr = dataBlock.getRecordItr(keys.orElse(Collections.emptyList()))) {
|
||||
Option<Schema> schemaOption = getMergedSchema(dataBlock);
|
||||
while (recordItr.hasNext()) {
|
||||
IndexedRecord record = recordItr.next();
|
||||
IndexedRecord currentRecord = recordItr.next();
|
||||
IndexedRecord record = schemaOption.isPresent() ? HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get()) : currentRecord;
|
||||
processNextRecord(createHoodieRecord(record, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN,
|
||||
this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName));
|
||||
totalLogRecords.incrementAndGet();
|
||||
@@ -370,6 +383,28 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get final Read Schema for support evolution.
|
||||
* step1: find the fileSchema for current dataBlock.
|
||||
* step2: determine whether fileSchema is compatible with the final read internalSchema.
|
||||
* step3: merge fileSchema and read internalSchema to produce final read schema.
|
||||
*
|
||||
* @param dataBlock current processed block
|
||||
* @return final read schema.
|
||||
*/
|
||||
private Option<Schema> getMergedSchema(HoodieDataBlock dataBlock) {
|
||||
Option<Schema> result = Option.empty();
|
||||
if (!internalSchema.isEmptySchema()) {
|
||||
Long currentInstantTime = Long.parseLong(dataBlock.getLogBlockHeader().get(INSTANT_TIME));
|
||||
InternalSchema fileSchema = InternalSchemaCache
|
||||
.searchSchemaAndCache(currentInstantTime, hoodieTableMetaClient, false);
|
||||
Schema mergeSchema = AvroInternalSchemaConverter
|
||||
.convert(new InternalSchemaMerger(fileSchema, internalSchema, true, false).mergeSchema(), readerSchema.getName());
|
||||
result = Option.of(mergeSchema);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create @{@link HoodieRecord} from the @{@link IndexedRecord}.
|
||||
*
|
||||
|
||||
@@ -36,6 +36,7 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.CorruptedLogFileException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -44,6 +45,7 @@ import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -74,6 +76,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
private final HoodieLogFile logFile;
|
||||
private final byte[] magicBuffer = new byte[6];
|
||||
private final Schema readerSchema;
|
||||
private InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema();
|
||||
private final String keyField;
|
||||
private boolean readBlockLazily;
|
||||
private long reverseLogFilePosition;
|
||||
@@ -97,6 +100,12 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
||||
boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups,
|
||||
String keyField) throws IOException {
|
||||
this(fs, logFile, readerSchema, bufferSize, readBlockLazily, reverseReader, enableRecordLookups, keyField, InternalSchema.getEmptyInternalSchema());
|
||||
}
|
||||
|
||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
||||
boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups,
|
||||
String keyField, InternalSchema internalSchema) throws IOException {
|
||||
this.hadoopConf = fs.getConf();
|
||||
// NOTE: We repackage {@code HoodieLogFile} here to make sure that the provided path
|
||||
// is prefixed with an appropriate scheme given that we're not propagating the FS
|
||||
@@ -108,6 +117,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
this.reverseReader = reverseReader;
|
||||
this.enableRecordLookups = enableRecordLookups;
|
||||
this.keyField = keyField;
|
||||
this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
|
||||
if (this.reverseReader) {
|
||||
this.reverseLogFilePosition = this.lastReverseLogFilePosition = this.logFile.getFileSize();
|
||||
}
|
||||
@@ -197,10 +207,10 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
switch (Objects.requireNonNull(blockType)) {
|
||||
case AVRO_DATA_BLOCK:
|
||||
if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
|
||||
return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
|
||||
return HoodieAvroDataBlock.getBlock(content.get(), readerSchema, internalSchema);
|
||||
} else {
|
||||
return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
|
||||
Option.ofNullable(readerSchema), header, footer, keyField);
|
||||
Option.ofNullable(readerSchema), header, footer, keyField, internalSchema);
|
||||
}
|
||||
|
||||
case HFILE_DATA_BLOCK:
|
||||
|
||||
@@ -24,6 +24,7 @@ import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -42,6 +43,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
private HoodieLogFileReader currentReader;
|
||||
private final FileSystem fs;
|
||||
private final Schema readerSchema;
|
||||
private InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema();
|
||||
private final boolean readBlocksLazily;
|
||||
private final boolean reverseLogReader;
|
||||
private final String recordKeyField;
|
||||
@@ -53,6 +55,12 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
HoodieLogFormatReader(FileSystem fs, List<HoodieLogFile> logFiles, Schema readerSchema, boolean readBlocksLazily,
|
||||
boolean reverseLogReader, int bufferSize, boolean enableInlineReading,
|
||||
String recordKeyField) throws IOException {
|
||||
this(fs, logFiles, readerSchema, readBlocksLazily, reverseLogReader, bufferSize, enableInlineReading, recordKeyField, InternalSchema.getEmptyInternalSchema());
|
||||
}
|
||||
|
||||
HoodieLogFormatReader(FileSystem fs, List<HoodieLogFile> logFiles, Schema readerSchema, boolean readBlocksLazily,
|
||||
boolean reverseLogReader, int bufferSize, boolean enableInlineReading,
|
||||
String recordKeyField, InternalSchema internalSchema) throws IOException {
|
||||
this.logFiles = logFiles;
|
||||
this.fs = fs;
|
||||
this.readerSchema = readerSchema;
|
||||
@@ -62,10 +70,11 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
this.prevReadersInOpenState = new ArrayList<>();
|
||||
this.recordKeyField = recordKeyField;
|
||||
this.enableInlineReading = enableInlineReading;
|
||||
this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
|
||||
if (logFiles.size() > 0) {
|
||||
HoodieLogFile nextLogFile = logFiles.remove(0);
|
||||
this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
|
||||
enableInlineReading, recordKeyField);
|
||||
enableInlineReading, recordKeyField, internalSchema);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,7 +114,7 @@ public class HoodieLogFormatReader implements HoodieLogFormat.Reader {
|
||||
this.prevReadersInOpenState.add(currentReader);
|
||||
}
|
||||
this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false,
|
||||
enableInlineReading, recordKeyField);
|
||||
enableInlineReading, recordKeyField, internalSchema);
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("unable to initialize read with log file ", io);
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -79,10 +80,10 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader
|
||||
ExternalSpillableMap.DiskMapType diskMapType,
|
||||
boolean isBitCaskDiskMapCompressionEnabled,
|
||||
boolean withOperationField, boolean enableFullScan,
|
||||
Option<String> partitionName) {
|
||||
Option<String> partitionName, InternalSchema internalSchema) {
|
||||
super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, readBlocksLazily, reverseReader, bufferSize,
|
||||
instantRange, withOperationField,
|
||||
enableFullScan, partitionName);
|
||||
enableFullScan, partitionName, internalSchema);
|
||||
try {
|
||||
// Store merged records for all versions for this log file, set the in-memory footprint to maxInMemoryMapSize
|
||||
this.records = new ExternalSpillableMap<>(maxMemorySizeInBytes, spillableMapBasePath, new DefaultSizeEstimator(),
|
||||
@@ -197,6 +198,7 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader
|
||||
protected String basePath;
|
||||
protected List<String> logFilePaths;
|
||||
protected Schema readerSchema;
|
||||
private InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema();
|
||||
protected String latestInstantTime;
|
||||
protected boolean readBlocksLazily;
|
||||
protected boolean reverseReader;
|
||||
@@ -293,6 +295,11 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withInternalSchema(InternalSchema internalSchema) {
|
||||
this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withOperationField(boolean withOperationField) {
|
||||
this.withOperationField = withOperationField;
|
||||
return this;
|
||||
@@ -310,7 +317,7 @@ public class HoodieMergedLogRecordScanner extends AbstractHoodieLogRecordReader
|
||||
latestInstantTime, maxMemorySizeInBytes, readBlocksLazily, reverseReader,
|
||||
bufferSize, spillableMapBasePath, instantRange, autoScan,
|
||||
diskMapType, isBitCaskDiskMapCompressionEnabled, withOperationField, true,
|
||||
Option.ofNullable(partitionName));
|
||||
Option.ofNullable(partitionName), internalSchema);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.ClosableIterator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.ByteArrayInputStream;
|
||||
@@ -62,6 +63,17 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
|
||||
private final ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
|
||||
|
||||
public HoodieAvroDataBlock(FSDataInputStream inputStream,
|
||||
Option<byte[]> content,
|
||||
boolean readBlockLazily,
|
||||
HoodieLogBlockContentLocation logBlockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyField, InternalSchema internalSchema) {
|
||||
super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false, internalSchema);
|
||||
}
|
||||
|
||||
public HoodieAvroDataBlock(FSDataInputStream inputStream,
|
||||
Option<byte[]> content,
|
||||
boolean readBlockLazily,
|
||||
@@ -126,7 +138,7 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
@Override
|
||||
protected ClosableIterator<IndexedRecord> deserializeRecords(byte[] content) throws IOException {
|
||||
checkState(this.readerSchema != null, "Reader's schema has to be non-null");
|
||||
return RecordIterator.getInstance(this, content);
|
||||
return RecordIterator.getInstance(this, content, internalSchema);
|
||||
}
|
||||
|
||||
private static class RecordIterator implements ClosableIterator<IndexedRecord> {
|
||||
@@ -138,7 +150,7 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
private int totalRecords = 0;
|
||||
private int readRecords = 0;
|
||||
|
||||
private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content) throws IOException {
|
||||
private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, InternalSchema internalSchema) throws IOException {
|
||||
this.content = content;
|
||||
|
||||
this.dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(this.content)));
|
||||
@@ -147,17 +159,26 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
int version = this.dis.readInt();
|
||||
HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version);
|
||||
|
||||
this.reader = new GenericDatumReader<>(writerSchema, readerSchema);
|
||||
Schema finalReadSchema = readerSchema;
|
||||
if (!internalSchema.isEmptySchema()) {
|
||||
// we should use write schema to read log file,
|
||||
// since when we have done some DDL operation, the readerSchema maybe different from writeSchema, avro reader will throw exception.
|
||||
// eg: origin writeSchema is: "a String, b double" then we add a new column now the readerSchema will be: "a string, c int, b double". it's wrong to use readerSchema to read old log file.
|
||||
// after we read those record by writeSchema, we rewrite those record with readerSchema in AbstractHoodieLogRecordReader
|
||||
finalReadSchema = writerSchema;
|
||||
}
|
||||
|
||||
this.reader = new GenericDatumReader<>(writerSchema, finalReadSchema);
|
||||
|
||||
if (logBlockVersion.hasRecordCount()) {
|
||||
this.totalRecords = this.dis.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content) throws IOException {
|
||||
public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content, InternalSchema internalSchema) throws IOException {
|
||||
// Get schema from the header
|
||||
Schema writerSchema = new Schema.Parser().parse(dataBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
return new RecordIterator(dataBlock.readerSchema, writerSchema, content);
|
||||
return new RecordIterator(dataBlock.readerSchema, writerSchema, content, internalSchema);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -209,12 +230,16 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
super(records, Collections.singletonMap(HeaderMetadataType.SCHEMA, schema.toString()), new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
}
|
||||
|
||||
public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) throws IOException {
|
||||
return getBlock(content, readerSchema, InternalSchema.getEmptyInternalSchema());
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is retained to provide backwards compatibility to HoodieArchivedLogs which were written using
|
||||
* HoodieLogFormat V1.
|
||||
*/
|
||||
@Deprecated
|
||||
public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) throws IOException {
|
||||
public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema, InternalSchema internalSchema) throws IOException {
|
||||
|
||||
SizeAwareDataInputStream dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(content)));
|
||||
|
||||
@@ -228,6 +253,10 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
readerSchema = writerSchema;
|
||||
}
|
||||
|
||||
if (!internalSchema.isEmptySchema()) {
|
||||
readerSchema = writerSchema;
|
||||
}
|
||||
|
||||
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(writerSchema, readerSchema);
|
||||
// 2. Get the total records
|
||||
int totalRecords = dis.readInt();
|
||||
|
||||
@@ -25,6 +25,7 @@ import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
@@ -60,6 +61,8 @@ public abstract class HoodieDataBlock extends HoodieLogBlock {
|
||||
|
||||
protected final Schema readerSchema;
|
||||
|
||||
protected InternalSchema internalSchema = InternalSchema.getEmptyInternalSchema();
|
||||
|
||||
/**
|
||||
* NOTE: This ctor is used on the write-path (ie when records ought to be written into the log)
|
||||
*/
|
||||
@@ -95,6 +98,25 @@ public abstract class HoodieDataBlock extends HoodieLogBlock {
|
||||
this.enablePointLookups = enablePointLookups;
|
||||
}
|
||||
|
||||
protected HoodieDataBlock(Option<byte[]> content,
|
||||
FSDataInputStream inputStream,
|
||||
boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> headers,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyFieldName,
|
||||
boolean enablePointLookups,
|
||||
InternalSchema internalSchema) {
|
||||
super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
this.records = Option.empty();
|
||||
this.keyFieldName = keyFieldName;
|
||||
// If no reader-schema has been provided assume writer-schema as one
|
||||
this.readerSchema = readerSchema.orElseGet(() -> getWriterSchema(super.getLogBlockHeader()));
|
||||
this.enablePointLookups = enablePointLookups;
|
||||
this.internalSchema = internalSchema == null ? InternalSchema.getEmptyInternalSchema() : internalSchema;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getContentBytes() throws IOException {
|
||||
// In case this method is called before realizing records from content
|
||||
|
||||
@@ -74,7 +74,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
REQUESTED_RESTORE_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION,
|
||||
ROLLBACK_EXTENSION, REQUESTED_ROLLBACK_EXTENSION, INFLIGHT_ROLLBACK_EXTENSION,
|
||||
REQUESTED_REPLACE_COMMIT_EXTENSION, INFLIGHT_REPLACE_COMMIT_EXTENSION, REPLACE_COMMIT_EXTENSION,
|
||||
REQUESTED_INDEX_COMMIT_EXTENSION, INFLIGHT_INDEX_COMMIT_EXTENSION, INDEX_COMMIT_EXTENSION));
|
||||
REQUESTED_INDEX_COMMIT_EXTENSION, INFLIGHT_INDEX_COMMIT_EXTENSION, INDEX_COMMIT_EXTENSION,
|
||||
REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION, INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION, SAVE_SCHEMA_ACTION_EXTENSION));
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class);
|
||||
protected HoodieTableMetaClient metaClient;
|
||||
|
||||
@@ -227,7 +228,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
|
||||
public void deleteInstantFileIfExists(HoodieInstant instant) {
|
||||
LOG.info("Deleting instant " + instant);
|
||||
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName());
|
||||
Path inFlightCommitFilePath = getInstantFileNamePath(instant.getFileName());
|
||||
try {
|
||||
if (metaClient.getFs().exists(inFlightCommitFilePath)) {
|
||||
boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false);
|
||||
@@ -246,7 +247,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
|
||||
private void deleteInstantFile(HoodieInstant instant) {
|
||||
LOG.info("Deleting instant " + instant);
|
||||
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName());
|
||||
Path inFlightCommitFilePath = getInstantFileNamePath(instant.getFileName());
|
||||
try {
|
||||
boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false);
|
||||
if (result) {
|
||||
@@ -261,7 +262,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
|
||||
@Override
|
||||
public Option<byte[]> getInstantDetails(HoodieInstant instant) {
|
||||
Path detailPath = new Path(metaClient.getMetaPath(), instant.getFileName());
|
||||
Path detailPath = getInstantFileNamePath(instant.getFileName());
|
||||
return readDataFromPath(detailPath);
|
||||
}
|
||||
|
||||
@@ -307,12 +308,12 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
|
||||
public Option<byte[]> readCleanerInfoAsBytes(HoodieInstant instant) {
|
||||
// Cleaner metadata are always stored only in timeline .hoodie
|
||||
return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName()));
|
||||
return readDataFromPath(getInstantFileNamePath(instant.getFileName()));
|
||||
}
|
||||
|
||||
public Option<byte[]> readRollbackInfoAsBytes(HoodieInstant instant) {
|
||||
// Rollback metadata are always stored only in timeline .hoodie
|
||||
return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName()));
|
||||
return readDataFromPath(getInstantFileNamePath(instant.getFileName()));
|
||||
}
|
||||
|
||||
public Option<byte[]> readRestoreInfoAsBytes(HoodieInstant instant) {
|
||||
@@ -542,24 +543,23 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
if (metaClient.getTimelineLayoutVersion().isNullVersion()) {
|
||||
// Re-create the .inflight file by opening a new file and write the commit metadata in
|
||||
createFileInMetaPath(fromInstant.getFileName(), data, allowRedundantTransitions);
|
||||
Path fromInstantPath = new Path(metaClient.getMetaPath(), fromInstant.getFileName());
|
||||
Path toInstantPath = new Path(metaClient.getMetaPath(), toInstant.getFileName());
|
||||
Path fromInstantPath = getInstantFileNamePath(fromInstant.getFileName());
|
||||
Path toInstantPath = getInstantFileNamePath(toInstant.getFileName());
|
||||
boolean success = metaClient.getFs().rename(fromInstantPath, toInstantPath);
|
||||
if (!success) {
|
||||
throw new HoodieIOException("Could not rename " + fromInstantPath + " to " + toInstantPath);
|
||||
}
|
||||
} else {
|
||||
// Ensures old state exists in timeline
|
||||
LOG.info("Checking for file exists ?" + new Path(metaClient.getMetaPath(), fromInstant.getFileName()));
|
||||
ValidationUtils.checkArgument(metaClient.getFs().exists(new Path(metaClient.getMetaPath(),
|
||||
fromInstant.getFileName())));
|
||||
LOG.info("Checking for file exists ?" + getInstantFileNamePath(fromInstant.getFileName()));
|
||||
ValidationUtils.checkArgument(metaClient.getFs().exists(getInstantFileNamePath(fromInstant.getFileName())));
|
||||
// Use Write Once to create Target File
|
||||
if (allowRedundantTransitions) {
|
||||
FileIOUtils.createFileInPath(metaClient.getFs(), new Path(metaClient.getMetaPath(), toInstant.getFileName()), data);
|
||||
FileIOUtils.createFileInPath(metaClient.getFs(), getInstantFileNamePath(toInstant.getFileName()), data);
|
||||
} else {
|
||||
createImmutableFileInPath(new Path(metaClient.getMetaPath(), toInstant.getFileName()), data);
|
||||
createImmutableFileInPath(getInstantFileNamePath(toInstant.getFileName()), data);
|
||||
}
|
||||
LOG.info("Create new file for toInstant ?" + new Path(metaClient.getMetaPath(), toInstant.getFileName()));
|
||||
LOG.info("Create new file for toInstant ?" + getInstantFileNamePath(toInstant.getFileName()));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Could not complete " + fromInstant, e);
|
||||
@@ -568,8 +568,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
|
||||
private void revertCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) {
|
||||
ValidationUtils.checkArgument(completed.getTimestamp().equals(inflight.getTimestamp()));
|
||||
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), inflight.getFileName());
|
||||
Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName());
|
||||
Path inFlightCommitFilePath = getInstantFileNamePath(inflight.getFileName());
|
||||
Path commitFilePath = getInstantFileNamePath(completed.getFileName());
|
||||
try {
|
||||
if (metaClient.getTimelineLayoutVersion().isNullVersion()) {
|
||||
if (!metaClient.getFs().exists(inFlightCommitFilePath)) {
|
||||
@@ -580,8 +580,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Path requestedInstantFilePath = new Path(metaClient.getMetaPath(),
|
||||
new HoodieInstant(State.REQUESTED, inflight.getAction(), inflight.getTimestamp()).getFileName());
|
||||
Path requestedInstantFilePath = getInstantFileNamePath(new HoodieInstant(State.REQUESTED,
|
||||
inflight.getAction(), inflight.getTimestamp()).getFileName());
|
||||
|
||||
// If inflight and requested files do not exist, create one
|
||||
if (!metaClient.getFs().exists(requestedInstantFilePath)) {
|
||||
@@ -600,6 +600,10 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
private Path getInstantFileNamePath(String fileName) {
|
||||
return new Path(fileName.contains(SCHEMA_COMMIT_ACTION) ? metaClient.getSchemaFolderName() : metaClient.getMetaPath(), fileName);
|
||||
}
|
||||
|
||||
public void transitionRequestedToInflight(String commitType, String inFlightInstant) {
|
||||
HoodieInstant requested = new HoodieInstant(HoodieInstant.State.REQUESTED, commitType, inFlightInstant);
|
||||
transitionRequestedToInflight(requested, Option.empty(), false);
|
||||
@@ -716,7 +720,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
|
||||
}
|
||||
|
||||
private void createFileInMetaPath(String filename, Option<byte[]> content, boolean allowOverwrite) {
|
||||
Path fullPath = new Path(metaClient.getMetaPath(), filename);
|
||||
Path fullPath = getInstantFileNamePath(filename);
|
||||
if (allowOverwrite || metaClient.getTimelineLayoutVersion().isNullVersion()) {
|
||||
FileIOUtils.createFileInPath(metaClient.getFs(), fullPath, content);
|
||||
} else {
|
||||
|
||||
@@ -176,6 +176,10 @@ public class HoodieInstant implements Serializable, Comparable<HoodieInstant> {
|
||||
return isInflight() ? HoodieTimeline.makeInflightIndexFileName(timestamp)
|
||||
: isRequested() ? HoodieTimeline.makeRequestedIndexFileName(timestamp)
|
||||
: HoodieTimeline.makeIndexCommitFileName(timestamp);
|
||||
} else if (HoodieTimeline.SCHEMA_COMMIT_ACTION.equals(action)) {
|
||||
return isInflight() ? HoodieTimeline.makeInflightSchemaFileName(timestamp)
|
||||
: isRequested() ? HoodieTimeline.makeRequestSchemaFileName(timestamp)
|
||||
: HoodieTimeline.makeSchemaFileName(timestamp);
|
||||
}
|
||||
throw new IllegalArgumentException("Cannot get file name for unknown action " + action);
|
||||
}
|
||||
|
||||
@@ -56,6 +56,8 @@ public interface HoodieTimeline extends Serializable {
|
||||
String REQUESTED_EXTENSION = ".requested";
|
||||
String RESTORE_ACTION = "restore";
|
||||
String INDEXING_ACTION = "indexing";
|
||||
// only for schema save
|
||||
String SCHEMA_COMMIT_ACTION = "schemacommit";
|
||||
|
||||
String[] VALID_ACTIONS_IN_TIMELINE = {COMMIT_ACTION, DELTA_COMMIT_ACTION,
|
||||
CLEAN_ACTION, SAVEPOINT_ACTION, RESTORE_ACTION, ROLLBACK_ACTION,
|
||||
@@ -88,6 +90,9 @@ public interface HoodieTimeline extends Serializable {
|
||||
String INFLIGHT_INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION + INFLIGHT_EXTENSION;
|
||||
String REQUESTED_INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION + REQUESTED_EXTENSION;
|
||||
String INDEX_COMMIT_EXTENSION = "." + INDEXING_ACTION;
|
||||
String SAVE_SCHEMA_ACTION_EXTENSION = "." + SCHEMA_COMMIT_ACTION;
|
||||
String INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION = "." + SCHEMA_COMMIT_ACTION + INFLIGHT_EXTENSION;
|
||||
String REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION = "." + SCHEMA_COMMIT_ACTION + REQUESTED_EXTENSION;
|
||||
|
||||
String INVALID_INSTANT_TS = "0";
|
||||
|
||||
@@ -497,4 +502,16 @@ public interface HoodieTimeline extends Serializable {
|
||||
static String makeRequestedIndexFileName(String instant) {
|
||||
return StringUtils.join(instant, HoodieTimeline.REQUESTED_INDEX_COMMIT_EXTENSION);
|
||||
}
|
||||
|
||||
static String makeSchemaFileName(String instantTime) {
|
||||
return StringUtils.join(instantTime, HoodieTimeline.SAVE_SCHEMA_ACTION_EXTENSION);
|
||||
}
|
||||
|
||||
static String makeInflightSchemaFileName(String instantTime) {
|
||||
return StringUtils.join(instantTime, HoodieTimeline.INFLIGHT_SAVE_SCHEMA_ACTION_EXTENSION);
|
||||
}
|
||||
|
||||
static String makeRequestSchemaFileName(String instantTime) {
|
||||
return StringUtils.join(instantTime, HoodieTimeline.REQUESTED_SAVE_SCHEMA_ACTION_EXTENSION);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,212 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util;
|
||||
|
||||
import com.github.benmanes.caffeine.cache.Cache;
|
||||
import com.github.benmanes.caffeine.cache.Caffeine;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
|
||||
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
|
||||
import org.apache.hudi.internal.schema.utils.SerDeHelper;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class InternalSchemaCache {
|
||||
private static final Logger LOG = LogManager.getLogger(InternalSchemaCache.class);
|
||||
// Use segment lock to reduce competition.
|
||||
// the lock size should be powers of 2 for better hash.
|
||||
private static Object[] lockList = new Object[16];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < lockList.length; i++) {
|
||||
lockList[i] = new Object();
|
||||
}
|
||||
}
|
||||
|
||||
// historySchemas cache maintain a map about (tablePath, HistorySchemas).
|
||||
// this is a Global cache, all threads in one container/executor share the same cache.
|
||||
private static final Cache<String, TreeMap<Long, InternalSchema>>
|
||||
HISTORICAL_SCHEMA_CACHE = Caffeine.newBuilder().maximumSize(1000).weakValues().build();
|
||||
|
||||
/**
|
||||
* Search internalSchema based on versionID.
|
||||
* first step: try to get internalSchema from hoodie commit files, we no need to add lock.
|
||||
* if we cannot get internalSchema by first step, then we try to get internalSchema from cache.
|
||||
*
|
||||
* @param versionID schema version_id need to search
|
||||
* @param metaClient current hoodie metaClient
|
||||
* @return internalSchema
|
||||
*/
|
||||
public static InternalSchema searchSchemaAndCache(long versionID, HoodieTableMetaClient metaClient, boolean cacheEnable) {
|
||||
Option<InternalSchema> candidateSchema = getSchemaByReadingCommitFile(versionID, metaClient);
|
||||
if (candidateSchema.isPresent()) {
|
||||
return candidateSchema.get();
|
||||
}
|
||||
if (!cacheEnable) {
|
||||
// parse history schema and return directly
|
||||
return InternalSchemaUtils.searchSchema(versionID, getHistoricalSchemas(metaClient));
|
||||
}
|
||||
String tablePath = metaClient.getBasePath();
|
||||
// use segment lock to reduce competition.
|
||||
synchronized (lockList[tablePath.hashCode() & (lockList.length - 1)]) {
|
||||
TreeMap<Long, InternalSchema> historicalSchemas = HISTORICAL_SCHEMA_CACHE.getIfPresent(tablePath);
|
||||
if (historicalSchemas == null || InternalSchemaUtils.searchSchema(versionID, historicalSchemas) == null) {
|
||||
historicalSchemas = getHistoricalSchemas(metaClient);
|
||||
HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas);
|
||||
} else {
|
||||
long maxVersionId = historicalSchemas.keySet().stream().max(Long::compareTo).get();
|
||||
if (versionID > maxVersionId) {
|
||||
historicalSchemas = getHistoricalSchemas(metaClient);
|
||||
HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas);
|
||||
}
|
||||
}
|
||||
return InternalSchemaUtils.searchSchema(versionID, historicalSchemas);
|
||||
}
|
||||
}
|
||||
|
||||
private static TreeMap<Long, InternalSchema> getHistoricalSchemas(HoodieTableMetaClient metaClient) {
|
||||
TreeMap<Long, InternalSchema> result = new TreeMap<>();
|
||||
FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient);
|
||||
String historySchemaStr = schemasManager.getHistorySchemaStr();
|
||||
if (!StringUtils.isNullOrEmpty(historySchemaStr)) {
|
||||
result = SerDeHelper.parseSchemas(historySchemaStr);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static Option<InternalSchema> getSchemaByReadingCommitFile(long versionID, HoodieTableMetaClient metaClient) {
|
||||
try {
|
||||
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> instants = timeline.getInstants().filter(f -> f.getTimestamp().equals(String.valueOf(versionID))).collect(Collectors.toList());
|
||||
if (instants.isEmpty()) {
|
||||
return Option.empty();
|
||||
}
|
||||
byte[] data = timeline.getInstantDetails(instants.get(0)).get();
|
||||
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
|
||||
String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
|
||||
return SerDeHelper.fromJson(latestInternalSchemaStr);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Failed to read schema from commit metadata", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get internalSchema and avroSchema for compaction/cluster operation.
|
||||
*
|
||||
* @param metaClient current hoodie metaClient
|
||||
* @param compactionAndClusteringInstant first instant before current compaction/cluster instant
|
||||
* @return (internalSchemaStrOpt, avroSchemaStrOpt) a pair of InternalSchema/avroSchema
|
||||
*/
|
||||
public static Pair<Option<String>, Option<String>> getInternalSchemaAndAvroSchemaForClusteringAndCompaction(HoodieTableMetaClient metaClient, String compactionAndClusteringInstant) {
|
||||
// try to load internalSchema to support Schema Evolution
|
||||
HoodieTimeline timelineBeforeCurrentCompaction = metaClient.getCommitsAndCompactionTimeline().findInstantsBefore(compactionAndClusteringInstant).filterCompletedInstants();
|
||||
Option<HoodieInstant> lastInstantBeforeCurrentCompaction = timelineBeforeCurrentCompaction.lastInstant();
|
||||
if (lastInstantBeforeCurrentCompaction.isPresent()) {
|
||||
// try to find internalSchema
|
||||
byte[] data = timelineBeforeCurrentCompaction.getInstantDetails(lastInstantBeforeCurrentCompaction.get()).get();
|
||||
HoodieCommitMetadata metadata;
|
||||
try {
|
||||
metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException(String.format("cannot read metadata from commit: %s", lastInstantBeforeCurrentCompaction.get()), e);
|
||||
}
|
||||
String internalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
|
||||
if (internalSchemaStr != null) {
|
||||
String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
|
||||
return Pair.of(Option.of(internalSchemaStr), Option.of(existingSchemaStr));
|
||||
}
|
||||
}
|
||||
return Pair.of(Option.empty(), Option.empty());
|
||||
}
|
||||
|
||||
/**
|
||||
* Give a schema versionId return its internalSchema.
|
||||
* This method will be called by spark tasks, we should minimize time cost.
|
||||
* We try our best to not use metaClient, since the initialization of metaClient is time cost
|
||||
* step1:
|
||||
* try to parser internalSchema from HoodieInstant directly
|
||||
* step2:
|
||||
* if we cannot parser internalSchema in step1,
|
||||
* try to find internalSchema in historySchema.
|
||||
*
|
||||
* @param versionId the internalSchema version to be search.
|
||||
* @param tablePath table path
|
||||
* @param hadoopConf conf
|
||||
* @param validCommits current validate commits, use to make up the commit file path/verify the validity of the history schema files
|
||||
* @return a internalSchema.
|
||||
*/
|
||||
public static InternalSchema getInternalSchemaByVersionId(long versionId, String tablePath, Configuration hadoopConf, String validCommits) {
|
||||
Set<String> commitSet = Arrays.stream(validCommits.split(",")).collect(Collectors.toSet());
|
||||
List<String> validateCommitList = commitSet.stream().map(fileName -> {
|
||||
String fileExtension = HoodieInstant.getTimelineFileExtension(fileName);
|
||||
return fileName.replace(fileExtension, "");
|
||||
}).collect(Collectors.toList());
|
||||
|
||||
FileSystem fs = FSUtils.getFs(tablePath, hadoopConf);
|
||||
Path hoodieMetaPath = new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME);
|
||||
//step1:
|
||||
Path candidateCommitFile = commitSet.stream().filter(fileName -> {
|
||||
String fileExtension = HoodieInstant.getTimelineFileExtension(fileName);
|
||||
return fileName.replace(fileExtension, "").equals(versionId + "");
|
||||
}).findFirst().map(f -> new Path(hoodieMetaPath, f)).orElse(null);
|
||||
if (candidateCommitFile != null) {
|
||||
try {
|
||||
byte[] data;
|
||||
try (FSDataInputStream is = fs.open(candidateCommitFile)) {
|
||||
data = FileIOUtils.readAsByteArray(is);
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
}
|
||||
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
|
||||
String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
|
||||
if (latestInternalSchemaStr != null) {
|
||||
return SerDeHelper.fromJson(latestInternalSchemaStr).orElse(null);
|
||||
}
|
||||
} catch (Exception e1) {
|
||||
// swallow this exception.
|
||||
LOG.warn(String.format("Cannot find internal schema from commit file %s. Falling back to parsing historical internal schema", candidateCommitFile.toString()));
|
||||
}
|
||||
}
|
||||
// step2:
|
||||
FileBasedInternalSchemaStorageManager fileBasedInternalSchemaStorageManager = new FileBasedInternalSchemaStorageManager(hadoopConf, new Path(tablePath));
|
||||
String lastestHistorySchema = fileBasedInternalSchemaStorageManager.getHistorySchemaStrByGivenValidCommits(validateCommitList);
|
||||
return InternalSchemaUtils.searchSchema(versionId, SerDeHelper.parseSchemas(lastestHistorySchema));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
|
||||
/**
 * Exception thrown for Hudi schema conversion failures.
 * This is a runtime (unchecked) exception extending {@link HoodieException};
 * callers are not forced to catch it.
 */
public class HoodieSchemaException extends HoodieException {
  /** Creates the exception without a detail message. */
  public HoodieSchemaException() {
    super();
  }

  /** Creates the exception with a detail message. */
  public HoodieSchemaException(String message) {
    super(message);
  }

  /** Creates the exception with a detail message and the underlying cause. */
  public HoodieSchemaException(String message, Throwable t) {
    super(message, t);
  }

  /** Creates the exception wrapping the underlying cause. */
  public HoodieSchemaException(Throwable t) {
    super(t);
  }
}
|
||||
@@ -0,0 +1,291 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.internal.schema.Types.Field;
|
||||
import org.apache.hudi.internal.schema.Types.RecordType;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Internal schema for hudi table.
|
||||
* used to support schema evolution.
|
||||
*/
|
||||
public class InternalSchema implements Serializable {
|
||||
|
||||
private static final long DEFAULT_VERSION_ID = 0;
|
||||
|
||||
private final RecordType record;
|
||||
|
||||
private int maxColumnId;
|
||||
private long versionId;
|
||||
|
||||
private transient Map<Integer, Field> idToField = null;
|
||||
private transient Map<String, Integer> nameToId = null;
|
||||
private transient Map<Integer, String> idToName = null;
|
||||
|
||||
public static InternalSchema getEmptyInternalSchema() {
|
||||
return new InternalSchema(-1L, new ArrayList<>());
|
||||
}
|
||||
|
||||
public boolean isEmptySchema() {
|
||||
return versionId < 0;
|
||||
}
|
||||
|
||||
public InternalSchema(List<Field> columns) {
|
||||
this(DEFAULT_VERSION_ID, columns);
|
||||
}
|
||||
|
||||
public InternalSchema(Field... columns) {
|
||||
this(DEFAULT_VERSION_ID, Arrays.asList(columns));
|
||||
}
|
||||
|
||||
public InternalSchema(long versionId, List<Field> cols) {
|
||||
this.versionId = versionId;
|
||||
this.record = RecordType.get(cols);
|
||||
idToName = cols.isEmpty() ? new HashMap<>() : InternalSchemaBuilder.getBuilder().buildIdToName(record);
|
||||
nameToId = cols.isEmpty() ? new HashMap<>() : idToName.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
|
||||
maxColumnId = idToName.isEmpty() ? -1 : idToName.keySet().stream().max(Comparator.comparing(Integer::valueOf)).get();
|
||||
}
|
||||
|
||||
public InternalSchema(long versionId, int maxColumnId, List<Field> cols) {
|
||||
this.maxColumnId = maxColumnId;
|
||||
this.versionId = versionId;
|
||||
this.record = RecordType.get(cols);
|
||||
buildIdToName();
|
||||
}
|
||||
|
||||
public InternalSchema(long versionId, int maxColumnId, Field... cols) {
|
||||
this(versionId, maxColumnId, Arrays.asList(cols));
|
||||
}
|
||||
|
||||
public RecordType getRecord() {
|
||||
return record;
|
||||
}
|
||||
|
||||
private Map<Integer, String> buildIdToName() {
|
||||
if (idToName == null) {
|
||||
idToName = InternalSchemaBuilder.getBuilder().buildIdToName(record);
|
||||
}
|
||||
return idToName;
|
||||
}
|
||||
|
||||
private Map<String, Integer> buildNameToId() {
|
||||
if (nameToId == null) {
|
||||
if (idToName != null && !idToName.isEmpty()) {
|
||||
nameToId = idToName.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
|
||||
return nameToId;
|
||||
}
|
||||
nameToId = InternalSchemaBuilder.getBuilder().buildNameToId(record);
|
||||
}
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
private Map<Integer, Field> buildIdToField() {
|
||||
if (idToField == null) {
|
||||
idToField = InternalSchemaBuilder.getBuilder().buildIdToField(record);
|
||||
}
|
||||
return idToField;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all columns full name.
|
||||
*/
|
||||
public List<String> getAllColsFullName() {
|
||||
if (nameToId == null) {
|
||||
nameToId = InternalSchemaBuilder.getBuilder().buildNameToId(record);
|
||||
}
|
||||
return Arrays.asList(nameToId.keySet().toArray(new String[0]));
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the version ID for this schema.
|
||||
*/
|
||||
public InternalSchema setSchemaId(long versionId) {
|
||||
this.versionId = versionId;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the version ID for this schema.
|
||||
*/
|
||||
public long schemaId() {
|
||||
return this.versionId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the version ID for this schema.
|
||||
*/
|
||||
public void setMaxColumnId(int maxColumnId) {
|
||||
this.maxColumnId = maxColumnId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the max column id for this schema.
|
||||
*/
|
||||
public int getMaxColumnId() {
|
||||
return this.maxColumnId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a List of the {@link Field columns} in this Schema.
|
||||
*/
|
||||
public List<Field> columns() {
|
||||
return record.fields();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the {@link Type} of a sub-field identified by the field name.
|
||||
*
|
||||
* @param id a field id
|
||||
* @return fullName of field of
|
||||
*/
|
||||
public String findfullName(int id) {
|
||||
if (idToName == null) {
|
||||
buildIdToName();
|
||||
}
|
||||
String result = idToName.get(id);
|
||||
return result == null ? "" : result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the {@link Type} of a sub-field identified by the field name.
|
||||
*
|
||||
* @param name a field name
|
||||
* @return a Type for the sub-field or null if it is not found
|
||||
*/
|
||||
public Type findType(String name) {
|
||||
if (name == null || name.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
Integer id = buildNameToId().get(name);
|
||||
if (id != null) { // name is found
|
||||
return findType(id);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the {@link Type} of a sub-field identified by the field id.
|
||||
*
|
||||
* @param id a field id
|
||||
* @return a Type for the sub-field or null if it is not found
|
||||
*/
|
||||
public Type findType(int id) {
|
||||
Field field = buildIdToField().get(id);
|
||||
if (field != null) {
|
||||
return field.type();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all field ids
|
||||
*/
|
||||
public Set<Integer> getAllIds() {
|
||||
if (idToName == null) {
|
||||
buildIdToName();
|
||||
}
|
||||
return idToName.keySet();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the sub-field identified by the field id.
|
||||
*
|
||||
* @param id a field id
|
||||
* @return the sub-field or null if it is not found
|
||||
*/
|
||||
public Field findField(int id) {
|
||||
return buildIdToField().get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a sub-field by name as a {@link Field}.
|
||||
* The result may be a top-level or a nested field.
|
||||
*
|
||||
* @param name a String name
|
||||
* @return a Type for the sub-field or null if it is not found
|
||||
*/
|
||||
public Field findField(String name) {
|
||||
if (name == null || name.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
Integer id = buildNameToId().get(name);
|
||||
if (id != null) {
|
||||
return buildIdToField().get(id);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether colName exists in current Schema.
|
||||
* Case insensitive.
|
||||
*
|
||||
* @param colName a colName
|
||||
* @return Whether colName exists in current Schema
|
||||
*/
|
||||
public boolean findDuplicateCol(String colName) {
|
||||
return idToName.entrySet().stream().map(e -> e.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet()).contains(colName);
|
||||
}
|
||||
|
||||
public int findIdByName(String name) {
|
||||
if (name == null || name.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
return buildNameToId().getOrDefault(name, -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("table {\n%s\n}",
|
||||
StringUtils.join(record.fields().stream()
|
||||
.map(f -> " " + f)
|
||||
.collect(Collectors.toList()).toArray(new String[0]), "\n"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
} else if (!(o instanceof InternalSchema)) {
|
||||
return false;
|
||||
}
|
||||
InternalSchema that = (InternalSchema) o;
|
||||
if (versionId != that.schemaId()) {
|
||||
return false;
|
||||
}
|
||||
return record.equals(that.record);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return record.hashCode();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,272 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import org.apache.hudi.internal.schema.visitor.InternalSchemaVisitor;
|
||||
import org.apache.hudi.internal.schema.visitor.NameToIDVisitor;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
 * A build class to help build fields for InternalSchema.
 * Stateless singleton: every public method is a pure function over the types
 * passed in, so sharing the single INSTANCE across threads is safe.
 */
public class InternalSchemaBuilder implements Serializable {
  private static final InternalSchemaBuilder INSTANCE = new InternalSchemaBuilder();

  /** Returns the shared singleton instance. */
  public static InternalSchemaBuilder getBuilder() {
    return INSTANCE;
  }

  private InternalSchemaBuilder() {
  }


  /**
   * Build a mapping from id to full field name for a internal Type.
   * if a field y belong to a struct filed x, then the full name of y is x.y
   *
   * @param type hoodie internal type
   * @return a mapping from id to full field name
   */
  public Map<Integer, String> buildIdToName(Type type) {
    Map<Integer, String> result = new HashMap<>();
    // invert the name -> id index produced by the visitor
    buildNameToId(type).forEach((k, v) -> result.put(v, k));
    return result;
  }

  /**
   * Build a mapping from full field name to id for a internal Type.
   * if a field y belong to a struct filed x, then the full name of y is x.y
   *
   * @param type hoodie internal type
   * @return a mapping from full field name to id
   */
  public Map<String, Integer> buildNameToId(Type type) {
    return visit(type, new NameToIDVisitor());
  }

  /**
   * Use to traverse all types in internalSchema with visitor.
   *
   * @param schema hoodie internal schema
   * @return vistor expected result.
   */
  public <T> T visit(InternalSchema schema, InternalSchemaVisitor<T> visitor) {
    return visitor.schema(schema, visit(schema.getRecord(), visitor));
  }

  /**
   * Recursively traverse a type with the visitor, firing the before/after
   * callbacks around each child visit so the visitor can track the current path.
   */
  public <T> T visit(Type type, InternalSchemaVisitor<T> visitor) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<T> results = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          visitor.beforeField(f);
          T result;
          try {
            result = visit(f.type(), visitor);
          } finally {
            // afterField must run even if the child visit throws
            visitor.afterField(f);
          }
          results.add(visitor.field(f, result));
        }
        return visitor.record(record, results);
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        T elementResult;
        Types.Field elementField = array.field(array.elementId());
        visitor.beforeArrayElement(elementField);
        try {
          elementResult = visit(elementField.type(), visitor);
        } finally {
          visitor.afterArrayElement(elementField);
        }
        return visitor.array(array, elementResult);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        T keyResult;
        T valueResult;
        Types.Field keyField = map.field(map.keyId());
        visitor.beforeMapKey(keyField);
        try {
          keyResult = visit(map.keyType(), visitor);
        } finally {
          visitor.afterMapKey(keyField);
        }
        Types.Field valueField = map.field(map.valueId());
        visitor.beforeMapValue(valueField);
        try {
          valueResult = visit(map.valueType(), visitor);
        } finally {
          visitor.afterMapValue(valueField);
        }
        return visitor.map(map, keyResult, valueResult);
      default:
        return visitor.primitive((Type.PrimitiveType)type);
    }
  }

  /**
   * Build a mapping from id to field for a internal Type.
   *
   * @param type hoodie internal type
   * @return a mapping from id to field
   */
  public Map<Integer, Types.Field> buildIdToField(Type type) {
    Map<Integer, Types.Field> idToField = new HashMap<>();
    visitIdToField(type, idToField);
    return idToField;
  }

  // Depth-first walk that collects every field (including nested record fields
  // and the pseudo-fields of arrays/maps) into the index keyed by field id.
  private void visitIdToField(Type type, Map<Integer, Types.Field> index) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        for (Types.Field field : record.fields()) {
          visitIdToField(field.type(), index);
          index.put(field.fieldId(), field);
        }
        return;
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        visitIdToField(array.elementType(), index);
        for (Types.Field field : array.fields()) {
          index.put(field.fieldId(), field);
        }
        return;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        visitIdToField(map.keyType(), index);
        visitIdToField(map.valueType(), index);
        for (Types.Field field : map.fields()) {
          index.put(field.fieldId(), field);
        }
        return;
      default:
        // primitive types carry no nested fields
        return;
    }
  }

  /**
   * Build a mapping which maintain the relation between child field id and it's parent field id.
   * if a child field y(which id is 9) belong to a nest field x(which id is 6), then (9 -> 6) will be added to the result map.
   * if a field has no parent field, nothings will be added.
   *
   * @param record hoodie record type.
   * @return a mapping from id to parentId for a record Type
   */
  public Map<Integer, Integer> index2Parents(Types.RecordType record) {
    Map<Integer, Integer> result = new HashMap<>();
    Deque<Integer> parentIds = new LinkedList<>();
    index2Parents(record, parentIds, result);
    return result;
  }

  // pids holds the chain of enclosing field ids; the deque head is the id that
  // becomes the parent of any field visited at the current level.
  private void index2Parents(Type type, Deque<Integer> pids, Map<Integer, Integer> id2p) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType)type;
        for (Types.Field f : record.fields()) {
          pids.push(f.fieldId());
          index2Parents(f.type(), pids, id2p);
          pids.pop();
        }

        for (Types.Field f : record.fields()) {
          // root record has no parent id.
          if (!pids.isEmpty()) {
            Integer pid = pids.peek();
            id2p.put(f.fieldId(), pid);
          }
        }
        return;
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Types.Field elementField = array.field(array.elementId());
        pids.push(elementField.fieldId());
        index2Parents(elementField.type(), pids, id2p);
        pids.pop();
        // NOTE(review): peek() returns null on an empty deque, which would map the
        // element to a null parent; presumably the root is always a RECORD so the
        // deque is non-empty here — confirm against callers.
        id2p.put(array.elementId(), pids.peek());
        return;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Types.Field keyField = map.field(map.keyId());
        Types.Field valueField = map.field(map.valueId());
        // visit key
        pids.push(map.keyId());
        index2Parents(keyField.type(), pids, id2p);
        pids.pop();
        // visit value
        pids.push(map.valueId());
        index2Parents(valueField.type(), pids, id2p);
        pids.pop();
        id2p.put(map.keyId(), pids.peek());
        id2p.put(map.valueId(), pids.peek());
        return;
      default:
    }
  }

  /**
   * Assigns new ids for all fields in a Type, based on initial id.
   *
   * @param type a type.
   * @param nextId initial id which used to fresh ids for all fields in a type
   * @return a new type with new ids
   */
  public Type refreshNewId(Type type, AtomicInteger nextId) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Types.Field> oldFields = record.fields();
        int currentId = nextId.get();
        // reserve a contiguous id block for this record's direct fields before
        // recursing, so siblings get consecutive ids and nested fields follow
        nextId.set(currentId + record.fields().size());
        List<Types.Field> internalFields = new ArrayList<>();
        for (int i = 0; i < oldFields.size(); i++) {
          Types.Field oldField = oldFields.get(i);
          Type fieldType = refreshNewId(oldField.type(), nextId);
          internalFields.add(Types.Field.get(currentId++, oldField.isOptional(), oldField.name(), fieldType, oldField.doc()));
        }
        return Types.RecordType.get(internalFields);
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        int elementId = nextId.get();
        nextId.set(elementId + 1);
        Type elementType = refreshNewId(array.elementType(), nextId);
        return Types.ArrayType.get(elementId, array.isElementOptional(), elementType);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        int keyId = nextId.get();
        int valueId = keyId + 1;
        nextId.set(keyId + 2);
        Type keyType = refreshNewId(map.keyType(), nextId);
        Type valueType = refreshNewId(map.valueType(), nextId);
        return Types.MapType.get(keyId, valueId, keyType, valueType, map.isValueOptional());
      default:
        // primitives carry no field ids; return as-is
        return type;
    }
  }
}
|
||||
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* The type of a schema, reference avro schema.
|
||||
* now avro version used by hoodie, not support localTime.
|
||||
* to do add support for localTime if avro version is updated
|
||||
*/
|
||||
public interface Type extends Serializable {
|
||||
enum TypeID {
|
||||
RECORD, ARRAY, MAP, FIXED, STRING, BINARY,
|
||||
INT, LONG, FLOAT, DOUBLE, DATE, BOOLEAN, TIME, TIMESTAMP, DECIMAL, UUID;
|
||||
private String name;
|
||||
TypeID() {
|
||||
this.name = this.name().toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
||||
static TypeID fromValue(String value) {
|
||||
try {
|
||||
return TypeID.valueOf(value.toUpperCase(Locale.ROOT));
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(String.format("Invalid value of Type: %s", value));
|
||||
}
|
||||
}
|
||||
|
||||
TypeID typeId();
|
||||
|
||||
default boolean isNestedType() {
|
||||
return false;
|
||||
}
|
||||
|
||||
abstract class PrimitiveType implements Type {
|
||||
@Override
|
||||
public boolean isNestedType() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
abstract class NestedType implements Type {
|
||||
|
||||
@Override
|
||||
public boolean isNestedType() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public abstract List<Types.Field> fields();
|
||||
|
||||
public abstract Type fieldType(String name);
|
||||
|
||||
public abstract Types.Field field(int id);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,716 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import org.apache.hudi.internal.schema.Type.PrimitiveType;
|
||||
import org.apache.hudi.internal.schema.Type.NestedType;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class Types {
|
||||
  // static container for type implementations; private ctor prevents instantiation
  private Types() {
  }
|
||||
|
||||
public static class BooleanType extends PrimitiveType {
|
||||
private static final BooleanType INSTANCE = new BooleanType();
|
||||
|
||||
public static BooleanType get() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return Type.TypeID.BOOLEAN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "boolean";
|
||||
}
|
||||
}
|
||||
|
||||
  /** Singleton int primitive type; obtain via {@link #get()}. */
  public static class IntType extends PrimitiveType {
    private static final IntType INSTANCE = new IntType();

    public static IntType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.INT;
    }

    @Override
    public String toString() {
      return "int";
    }
  }
|
||||
|
||||
  /** Singleton long primitive type; obtain via {@link #get()}. */
  public static class LongType extends PrimitiveType {
    private static final LongType INSTANCE = new LongType();

    public static LongType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.LONG;
    }

    @Override
    public String toString() {
      return "long";
    }
  }
|
||||
|
||||
  /** Singleton float primitive type; obtain via {@link #get()}. */
  public static class FloatType extends PrimitiveType {
    private static final FloatType INSTANCE = new FloatType();

    public static FloatType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.FLOAT;
    }

    @Override
    public String toString() {
      return "float";
    }
  }
|
||||
|
||||
  /** Singleton double primitive type; obtain via {@link #get()}. */
  public static class DoubleType extends PrimitiveType {
    private static final DoubleType INSTANCE = new DoubleType();

    public static DoubleType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.DOUBLE;
    }

    @Override
    public String toString() {
      return "double";
    }
  }
|
||||
|
||||
  /** Singleton date primitive type; obtain via {@link #get()}. */
  public static class DateType extends PrimitiveType {
    private static final DateType INSTANCE = new DateType();

    public static DateType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.DATE;
    }

    @Override
    public String toString() {
      return "date";
    }
  }
|
||||
|
||||
  /** Singleton time primitive type; obtain via {@link #get()}. */
  public static class TimeType extends PrimitiveType {
    private static final TimeType INSTANCE = new TimeType();

    public static TimeType get() {
      return INSTANCE;
    }

    // private: callers must go through get()
    private TimeType() {
    }

    @Override
    public TypeID typeId() {
      return TypeID.TIME;
    }

    @Override
    public String toString() {
      return "time";
    }
  }
|
||||
|
||||
  /** Singleton timestamp primitive type; obtain via {@link #get()}. */
  public static class TimestampType extends PrimitiveType {
    private static final TimestampType INSTANCE = new TimestampType();

    public static TimestampType get() {
      return INSTANCE;
    }

    // private: callers must go through get()
    private TimestampType() {
    }

    @Override
    public TypeID typeId() {
      return TypeID.TIMESTAMP;
    }

    @Override
    public String toString() {
      return "timestamp";
    }
  }
|
||||
|
||||
  /** Singleton string primitive type; obtain via {@link #get()}. */
  public static class StringType extends PrimitiveType {
    private static final StringType INSTANCE = new StringType();

    public static StringType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.STRING;
    }

    @Override
    public String toString() {
      return "string";
    }
  }
|
||||
|
||||
  /** Singleton binary primitive type; obtain via {@link #get()}. */
  public static class BinaryType extends PrimitiveType {
    private static final BinaryType INSTANCE = new BinaryType();

    public static BinaryType get() {
      return INSTANCE;
    }

    @Override
    public TypeID typeId() {
      return TypeID.BINARY;
    }

    @Override
    public String toString() {
      return "binary";
    }
  }
|
||||
|
||||
public static class FixedType extends PrimitiveType {
|
||||
public static FixedType getFixed(int size) {
|
||||
return new FixedType(size);
|
||||
}
|
||||
|
||||
private final int size;
|
||||
|
||||
private FixedType(int length) {
|
||||
this.size = length;
|
||||
}
|
||||
|
||||
public int getFixedSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return TypeID.FIXED;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("fixed[%d]", size);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
} else if (!(o instanceof FixedType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
FixedType fixedType = (FixedType) o;
|
||||
return size == fixedType.size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(FixedType.class, size);
|
||||
}
|
||||
}
|
||||
|
||||
public static class DecimalType extends PrimitiveType {
|
||||
public static DecimalType get(int precision, int scale) {
|
||||
return new DecimalType(precision, scale);
|
||||
}
|
||||
|
||||
private final int scale;
|
||||
private final int precision;
|
||||
|
||||
private DecimalType(int precision, int scale) {
|
||||
this.scale = scale;
|
||||
this.precision = precision;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether this DecimalType is wider than `other`. If yes, it means `other`
|
||||
* can be casted into `this` safely without losing any precision or range.
|
||||
*/
|
||||
public boolean isWiderThan(PrimitiveType other) {
|
||||
if (other instanceof DecimalType) {
|
||||
DecimalType dt = (DecimalType) other;
|
||||
return (precision - scale) >= (dt.precision - dt.scale) && scale > dt.scale;
|
||||
}
|
||||
if (other instanceof IntType) {
|
||||
return isWiderThan(get(10, 0));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether this DecimalType is tighter than `other`. If yes, it means `this`
|
||||
* can be casted into `other` safely without losing any precision or range.
|
||||
*/
|
||||
public boolean isTighterThan(PrimitiveType other) {
|
||||
if (other instanceof DecimalType) {
|
||||
DecimalType dt = (DecimalType) other;
|
||||
return (precision - scale) <= (dt.precision - dt.scale) && scale <= dt.scale;
|
||||
}
|
||||
if (other instanceof IntType) {
|
||||
return isTighterThan(get(10, 0));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int scale() {
|
||||
return scale;
|
||||
}
|
||||
|
||||
public int precision() {
|
||||
return precision;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return TypeID.DECIMAL;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("decimal(%d, %d)", precision, scale);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
} else if (!(o instanceof DecimalType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DecimalType that = (DecimalType) o;
|
||||
if (scale != that.scale) {
|
||||
return false;
|
||||
}
|
||||
return precision == that.precision;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(DecimalType.class, scale, precision);
|
||||
}
|
||||
}
|
||||
|
||||
public static class UUIDType extends PrimitiveType {
|
||||
private static final UUIDType INSTANCE = new UUIDType();
|
||||
|
||||
public static UUIDType get() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return TypeID.UUID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "uuid";
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * A field within a record: an id-addressed, named slot with a type, optional
   * doc string, and an experimental default value.
   *
   * <p>NOTE(review): implements {@code Serializable} without an explicit
   * {@code serialVersionUID} — the generated id will change if the class shape
   * changes; confirm whether instances are ever persisted across versions.
   */
  public static class Field implements Serializable {
    // Experimental method to support defaultValue
    public static Field get(int id, boolean isOptional, String name, Type type, String doc, Object defaultValue) {
      return new Field(isOptional, id, name, type, doc, defaultValue);
    }

    /** Creates a field with a doc string and no default value. */
    public static Field get(int id, boolean isOptional, String name, Type type, String doc) {
      return new Field(isOptional, id, name, type, doc, null);
    }

    /** Creates a field with no doc string and no default value. */
    public static Field get(int id, boolean isOptional, String name, Type type) {
      return new Field(isOptional, id, name, type, null, null);
    }

    /** Creates an optional field (isOptional defaults to true) with no doc and no default value. */
    public static Field get(int id, String name, Type type) {
      return new Field(true, id, name, type, null, null);
    }

    private final boolean isOptional; // whether null values are permitted
    private final int id;             // stable field id used for schema evolution
    private final String name;
    private final Type type;
    private final String doc;
    // Experimental properties
    private final Object defaultValue;

    private Field(boolean isOptional, int id, String name, Type type, String doc, Object defaultValue) {
      this.isOptional = isOptional;
      this.id = id;
      this.name = name;
      this.type = type;
      this.doc = doc;
      this.defaultValue = defaultValue;
    }

    /** Returns the experimental default value, or null when none was set. */
    public Object getDefaultValue() {
      return defaultValue;
    }

    /** Whether this field may hold null values. */
    public boolean isOptional() {
      return isOptional;
    }

    /** The stable id of this field. */
    public int fieldId() {
      return id;
    }

    public String name() {
      return name;
    }

    public Type type() {
      return type;
    }

    public String doc() {
      return doc;
    }

    @Override
    public String toString() {
      // Format: "<id>: <name>: optional|required <type> (<doc>)"; doc part is omitted when null.
      return String.format("%d: %s: %s %s",
          id, name, isOptional ? "optional" : "required", type) + (doc != null ? " (" + doc + ")" : "");
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      } else if (!(o instanceof Field)) {
        return false;
      }

      // NOTE(review): defaultValue is intentionally(?) excluded from equality —
      // consistent with hashCode below, presumably because the feature is still
      // experimental; confirm before relying on equality for default changes.
      Field that = (Field) o;
      if (isOptional != that.isOptional) {
        return false;
      } else if (id != that.id) {
        return false;
      } else if (!name.equals(that.name)) {
        return false;
      } else if (!Objects.equals(doc, that.doc)) {
        return false;
      }
      return type.equals(that.type);
    }

    @Override
    public int hashCode() {
      // defaultValue excluded — keeps the hash consistent with equals above.
      return Objects.hash(Field.class, id, isOptional, name, type);
    }
  }
|
||||
|
||||
public static class RecordType extends NestedType {
|
||||
|
||||
public static RecordType get(List<Field> fields) {
|
||||
return new RecordType(fields);
|
||||
}
|
||||
|
||||
public static RecordType get(Field... fields) {
|
||||
return new RecordType(Arrays.asList(fields));
|
||||
}
|
||||
|
||||
private final Field[] fields;
|
||||
|
||||
private transient Map<String, Field> nameToFields = null;
|
||||
private transient Map<Integer, Field> idToFields = null;
|
||||
|
||||
private RecordType(List<Field> fields) {
|
||||
this.fields = new Field[fields.size()];
|
||||
for (int i = 0; i < this.fields.length; i += 1) {
|
||||
this.fields[i] = fields.get(i);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Field> fields() {
|
||||
return Arrays.asList(fields);
|
||||
}
|
||||
|
||||
public Field field(String name) {
|
||||
if (nameToFields == null) {
|
||||
nameToFields = new HashMap<>();
|
||||
for (Field field : fields) {
|
||||
nameToFields.put(field.name().toLowerCase(Locale.ROOT), field);
|
||||
}
|
||||
}
|
||||
return nameToFields.get(name.toLowerCase(Locale.ROOT));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Field field(int id) {
|
||||
if (idToFields == null) {
|
||||
idToFields = new HashMap<>();
|
||||
for (Field field : fields) {
|
||||
idToFields.put(field.fieldId(), field);
|
||||
}
|
||||
}
|
||||
return idToFields.get(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Type fieldType(String name) {
|
||||
Field field = field(name);
|
||||
if (field != null) {
|
||||
return field.type();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return TypeID.RECORD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Record<%s>", Arrays.stream(fields).map(f -> f.toString()).collect(Collectors.joining("-")));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
} else if (!(o instanceof RecordType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
RecordType that = (RecordType) o;
|
||||
return Arrays.equals(fields, that.fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(Field.class, Arrays.hashCode(fields));
|
||||
}
|
||||
}
|
||||
|
||||
public static class ArrayType extends NestedType {
|
||||
public static ArrayType get(int elementId, boolean isOptional, Type elementType) {
|
||||
return new ArrayType(Field.get(elementId, isOptional,"element", elementType));
|
||||
}
|
||||
|
||||
private final Field elementField;
|
||||
|
||||
private ArrayType(Field elementField) {
|
||||
this.elementField = elementField;
|
||||
}
|
||||
|
||||
public Type elementType() {
|
||||
return elementField.type();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Type fieldType(String name) {
|
||||
if ("element".equals(name)) {
|
||||
return elementType();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Field field(int id) {
|
||||
if (elementField.fieldId() == id) {
|
||||
return elementField;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Field> fields() {
|
||||
return Arrays.asList(elementField);
|
||||
}
|
||||
|
||||
public int elementId() {
|
||||
return elementField.fieldId();
|
||||
}
|
||||
|
||||
public boolean isElementOptional() {
|
||||
return elementField.isOptional;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeID typeId() {
|
||||
return TypeID.ARRAY;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("list<%s>", elementField.type());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
} else if (!(o instanceof ArrayType)) {
|
||||
return false;
|
||||
}
|
||||
ArrayType listType = (ArrayType) o;
|
||||
return elementField.equals(listType.elementField);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(ArrayType.class, elementField);
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * A map type modeled as two synthetic fields, "key" and "value", each with its
   * own stable field id.
   */
  public static class MapType extends NestedType {

    /**
     * Creates a map type.
     * NOTE(review): this overload marks both key and value optional, because
     * Field.get(id, name, type) defaults isOptional to true — confirm map keys
     * are really meant to be nullable here.
     */
    public static MapType get(int keyId, int valueId, Type keyType, Type valueType) {
      return new MapType(
          Field.get(keyId, "key", keyType),
          Field.get(valueId, "value", valueType));
    }

    /** Creates a map type with explicit nullability applied to both key and value. */
    public static MapType get(int keyId, int valueId, Type keyType, Type valueType, boolean isOptional) {
      return new MapType(
          Field.get(keyId, isOptional, "key", keyType),
          Field.get(valueId, isOptional, "value", valueType));
    }

    private final Field keyField;
    private final Field valueField;
    // Lazily-built [key, value] list; NOTE(review): lazy init is not thread-safe —
    // assumed single-threaded access, confirm with callers.
    private transient List<Field> fields = null;

    private MapType(Field keyField, Field valueField) {
      this.keyField = keyField;
      this.valueField = valueField;
    }

    /** Type of the map's keys. */
    public Type keyType() {
      return keyField.type();
    }

    /** Type of the map's values. */
    public Type valueType() {
      return valueField.type();
    }

    @Override
    public Type fieldType(String name) {
      // Only the synthetic "key"/"value" fields are addressable by name.
      if ("key".equals(name)) {
        return keyField.type();
      } else if ("value".equals(name)) {
        return valueField.type();
      }
      return null;
    }

    @Override
    public Field field(int id) {
      if (keyField.fieldId() == id) {
        return keyField;
      } else if (valueField.fieldId() == id) {
        return valueField;
      }
      return null;
    }

    @Override
    public List<Field> fields() {
      if (fields == null) {
        fields = Arrays.asList(keyField, valueField);
      }
      return fields;
    }

    /** Stable field id of the key field. */
    public int keyId() {
      return keyField.fieldId();
    }

    /** Stable field id of the value field. */
    public int valueId() {
      return valueField.fieldId();
    }

    /** Whether map values may be null. */
    public boolean isValueOptional() {
      return valueField.isOptional;
    }

    @Override
    public TypeID typeId() {
      return TypeID.MAP;
    }

    @Override
    public String toString() {
      return String.format("map<%s, %s>", keyField.type(), valueField.type());
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      } else if (!(o instanceof MapType)) {
        return false;
      }

      MapType mapType = (MapType) o;
      if (!keyField.equals(mapType.keyField)) {
        return false;
      }
      return valueField.equals(mapType.valueField);
    }

    @Override
    public int hashCode() {
      return Objects.hash(MapType.class, keyField, valueField);
    }
  }
|
||||
}
|
||||
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Manage schema change for HoodieWriteClient.
|
||||
*/
|
||||
public class InternalSchemaChangeApplier {
|
||||
private InternalSchema latestSchema;
|
||||
|
||||
public InternalSchemaChangeApplier(InternalSchema latestSchema) {
|
||||
this.latestSchema = latestSchema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add columns to table.
|
||||
*
|
||||
* @param colName col name to be added. if we want to add col to a nested filed, the fullName should be specify
|
||||
* @param colType col type to be added.
|
||||
* @param doc col doc to be added.
|
||||
* @param position col position to be added
|
||||
* @param positionType col position change type. now support three change types: first/after/before
|
||||
*/
|
||||
public InternalSchema applyAddChange(
|
||||
String colName,
|
||||
Type colType,
|
||||
String doc,
|
||||
String position,
|
||||
TableChange.ColumnPositionChange.ColumnPositionType positionType) {
|
||||
TableChanges.ColumnAddChange add = TableChanges.ColumnAddChange.get(latestSchema);
|
||||
String parentName = TableChangesHelper.getParentName(colName);
|
||||
add.addColumns(parentName, colName, colType, doc);
|
||||
if (positionType != null) {
|
||||
switch (positionType) {
|
||||
case NO_OPERATION:
|
||||
break;
|
||||
case FIRST:
|
||||
add.addPositionChange(colName, "", positionType);
|
||||
break;
|
||||
case AFTER:
|
||||
case BEFORE:
|
||||
if (position == null || position.isEmpty()) {
|
||||
throw new IllegalArgumentException("position should not be null/empty_string when specify positionChangeType as after/before");
|
||||
}
|
||||
String referParentName = TableChangesHelper.getParentName(position);
|
||||
if (!parentName.equals(referParentName)) {
|
||||
throw new IllegalArgumentException("cannot reorder two columns which has different parent");
|
||||
}
|
||||
add.addPositionChange(colName, position, positionType);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", positionType));
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format("positionType should be specified"));
|
||||
}
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, add);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete columns to table.
|
||||
*
|
||||
* @param colNames col name to be deleted. if we want to delete col from a nested filed, the fullName should be specify
|
||||
*/
|
||||
public InternalSchema applyDeleteChange(String... colNames) {
|
||||
TableChanges.ColumnDeleteChange delete = TableChanges.ColumnDeleteChange.get(latestSchema);
|
||||
Arrays.stream(colNames).forEach(colName -> delete.deleteColumn(colName));
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, delete);
|
||||
}
|
||||
|
||||
/**
|
||||
* Rename col name for hudi table.
|
||||
*
|
||||
* @param colName col name to be renamed. if we want to rename col from a nested filed, the fullName should be specify
|
||||
* @param newName new name for current col. no need to specify fullName.
|
||||
*/
|
||||
public InternalSchema applyRenameChange(String colName, String newName) {
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema);
|
||||
updateChange.renameColumn(colName, newName);
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update col nullability for hudi table.
|
||||
*
|
||||
* @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify
|
||||
* @param nullable .
|
||||
*/
|
||||
public InternalSchema applyColumnNullabilityChange(String colName, boolean nullable) {
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema);
|
||||
updateChange.updateColumnNullability(colName, nullable);
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update col type for hudi table.
|
||||
*
|
||||
* @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify
|
||||
* @param newType .
|
||||
*/
|
||||
public InternalSchema applyColumnTypeChange(String colName, Type newType) {
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema);
|
||||
updateChange.updateColumnType(colName, newType);
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update col comment for hudi table.
|
||||
*
|
||||
* @param colName col name to be changed. if we want to change col from a nested filed, the fullName should be specify
|
||||
* @param doc .
|
||||
*/
|
||||
public InternalSchema applyColumnCommentChange(String colName, String doc) {
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema);
|
||||
updateChange.updateColumnComment(colName, doc);
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reorder the position of col.
|
||||
*
|
||||
* @param colName column which need to be reordered. if we want to change col from a nested filed, the fullName should be specify.
|
||||
* @param referColName reference position.
|
||||
* @param positionType col position change type. now support three change types: first/after/before
|
||||
*/
|
||||
public InternalSchema applyReOrderColPositionChange(
|
||||
String colName,
|
||||
String referColName,
|
||||
TableChange.ColumnPositionChange.ColumnPositionType positionType) {
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(latestSchema);
|
||||
String parentName = TableChangesHelper.getParentName(colName);
|
||||
String referParentName = TableChangesHelper.getParentName(referColName);
|
||||
if (positionType.equals(TableChange.ColumnPositionChange.ColumnPositionType.FIRST)) {
|
||||
updateChange.addPositionChange(colName, "", positionType);
|
||||
} else if (parentName.equals(referParentName)) {
|
||||
updateChange.addPositionChange(colName, referColName, positionType);
|
||||
} else {
|
||||
throw new IllegalArgumentException("cannot reorder two columns which has different parent");
|
||||
}
|
||||
return SchemaChangeUtils.applyTableChanges2Schema(latestSchema, updateChange);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Auxiliary class.
|
||||
* help to merge file schema and query schema to produce final read schema for avro/parquet file
|
||||
*/
|
||||
public class InternalSchemaMerger {
  // Schema of the physical file being read.
  private final InternalSchema fileSchema;
  // Schema requested by the query (the evolution target the merge is driven by).
  private final InternalSchema querySchema;
  // now there exist some bugs when we use spark update/merge api,
  // those operation will change col nullability from optional to required which is wrong.
  // Before that bug is fixed, we need to do adapt.
  // if ignoreRequiredAttribute is true, we will ignore the col's required attribute.
  private final boolean ignoreRequiredAttribute;
  // Whether to use column Type from file schema to read files when we find some column type has changed.
  // spark parquetReader need the original column type to read data, otherwise the parquetReader will failed.
  // eg: current column type is StringType, now we changed it to decimalType,
  // we should not pass decimalType to parquetReader, we must pass StringType to it; when we read out the data, we convert data from String to Decimal, everything is ok.
  // for log reader
  // since our reWriteRecordWithNewSchema function support rewrite directly, so we no need this parameter
  // eg: current column type is StringType, now we changed it to decimalType,
  // we can pass decimalType to reWriteRecordWithNewSchema directly, everything is ok.
  private boolean useColumnTypeFromFileSchema = true;

  public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema) {
    this.fileSchema = fileSchema;
    this.querySchema = querySchema;
    this.ignoreRequiredAttribute = ignoreRequiredAttribute;
    this.useColumnTypeFromFileSchema = useColumnTypeFromFileSchema;
  }

  /**
   * Create final read schema to read avro/parquet file.
   *
   * @return read schema to read avro/parquet file.
   */
  public InternalSchema mergeSchema() {
    Types.RecordType record = (Types.RecordType) mergeType(querySchema.getRecord(), 0);
    return new InternalSchema(record.fields());
  }

  /**
   * Create final read schema to read avro/parquet file.
   * this is auxiliary function used by mergeSchema.
   * Walks the query-schema type tree recursively; records, arrays and maps
   * recurse into their children, primitives are resolved against the file schema.
   */
  private Type mergeType(Type type, int currentTypeId) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Type> newTypes = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          Type newType = mergeType(f.type(), f.fieldId());
          newTypes.add(newType);
        }
        return Types.RecordType.get(buildRecordType(record.fields(), newTypes));
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType;
        Types.Field elementField = array.fields().get(0);
        newElementType = mergeType(elementField.type(), elementField.fieldId());
        return buildArrayType(array, newElementType);
      case MAP:
        // Only the value side is merged; key types are assumed unchanged.
        Types.MapType map = (Types.MapType) type;
        Type newValueType = mergeType(map.valueType(), map.valueId());
        return buildMapType(map, newValueType);
      default:
        return buildPrimitiveType((Type.PrimitiveType) type, currentTypeId);
    }
  }

  /**
   * Rebuilds the record's field list, reconciling each query-schema field with the
   * file schema: same id + same full name means (at most) a type change; same id but
   * different full name means a rename; an id absent from the file schema means the
   * column was added (or renamed back after a parent rename).
   */
  private List<Types.Field> buildRecordType(List<Types.Field> oldFields, List<Type> newTypes) {
    List<Types.Field> newFields = new ArrayList<>();
    for (int i = 0; i < newTypes.size(); i++) {
      Type newType = newTypes.get(i);
      Types.Field oldField = oldFields.get(i);
      int fieldId = oldField.fieldId();
      String fullName = querySchema.findfullName(fieldId);
      if (fileSchema.findField(fieldId) != null) {
        if (fileSchema.findfullName(fieldId).equals(fullName)) {
          // maybe col type changed, deal with it.
          newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
        } else {
          // find rename, deal with it.
          newFields.add(dealWithRename(fieldId, newType, oldField));
        }
      } else {
        // buildFullName
        fullName = normalizeFullName(fullName);
        if (fileSchema.findField(fullName) != null) {
          // NOTE(review): the literal string "suffix" is appended to the column name
          // here — presumably to deliberately miss in the file so the reader treats
          // the column as newly added rather than matching a stale name; confirm intent.
          newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name() + "suffix", oldField.type(), oldField.doc()));
        } else {
          // find add column
          // now there exist some bugs when we use spark update/merge api, those operation will change col optional to required.
          if (ignoreRequiredAttribute) {
            // Force the added column optional to work around the nullability bug above.
            newFields.add(Types.Field.get(oldField.fieldId(), true, oldField.name(), newType, oldField.doc()));
          } else {
            newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
          }
        }
      }
    }
    return newFields;
  }

  /**
   * Builds the merged field for a renamed column: keep the NAME from the file schema
   * (so the physical reader finds the column) while keeping the query field's id,
   * nullability and doc. For primitives, the type comes from the file schema when
   * useColumnTypeFromFileSchema is set.
   */
  private Types.Field dealWithRename(int fieldId, Type newType, Types.Field oldField) {
    Types.Field fieldFromFileSchema = fileSchema.findField(fieldId);
    String nameFromFileSchema = fieldFromFileSchema.name();
    Type typeFromFileSchema = fieldFromFileSchema.type();
    // Current design mechanism guarantees nestedType change is not allowed, so no need to consider.
    if (newType.isNestedType()) {
      return Types.Field.get(oldField.fieldId(), oldField.isOptional(), nameFromFileSchema, newType, oldField.doc());
    } else {
      return Types.Field.get(oldField.fieldId(), oldField.isOptional(), nameFromFileSchema, useColumnTypeFromFileSchema ? typeFromFileSchema : newType, oldField.doc());
    }
  }

  /**
   * Rewrites each ancestor segment of {@code fullName} to the name the file schema
   * knows it by, so a column re-added under a renamed parent can still be located.
   */
  private String normalizeFullName(String fullName) {
    // find parent rename, and normalize fullName
    // eg: we renamed a nest field struct(c, d) to aa, the we delete a.d and add it back later.
    String[] nameParts = fullName.split("\\.");
    String[] normalizedNameParts = new String[nameParts.length];
    System.arraycopy(nameParts, 0, normalizedNameParts, 0, nameParts.length);
    for (int j = 0; j < nameParts.length - 1; j++) {
      StringBuilder sb = new StringBuilder();
      for (int k = 0; k <= j; k++) {
        // NOTE(review): parts are concatenated WITHOUT a "." separator, so for
        // "a.b.c" the candidate parent at j=1 is "ab" rather than "a.b" — verify
        // this matches the name format findIdByName expects.
        sb.append(nameParts[k]);
      }
      String parentName = sb.toString();
      int parentFieldIdFromQuerySchema = querySchema.findIdByName(parentName);
      String parentNameFromFileSchema = fileSchema.findfullName(parentFieldIdFromQuerySchema);
      if (parentNameFromFileSchema.isEmpty()) {
        break;
      }
      if (!parentNameFromFileSchema.equalsIgnoreCase(parentName)) {
        // find parent rename, update nameParts
        String[] parentNameParts = parentNameFromFileSchema.split("\\.");
        System.arraycopy(parentNameParts, 0, normalizedNameParts, 0, parentNameParts.length);
      }
    }
    return StringUtils.join(normalizedNameParts, ".");
  }

  // Returns the original array when the element type is unchanged (reference
  // comparison), otherwise a new ArrayType with the merged element type.
  private Type buildArrayType(Types.ArrayType array, Type newType) {
    Types.Field elementField = array.fields().get(0);
    int elementId = elementField.fieldId();
    if (elementField.type() == newType) {
      return array;
    } else {
      return Types.ArrayType.get(elementId, elementField.isOptional(), newType);
    }
  }

  // Returns the original map when the value type is unchanged (reference
  // comparison), otherwise a new MapType keeping key/ids and value nullability.
  private Type buildMapType(Types.MapType map, Type newValue) {
    Types.Field valueFiled = map.fields().get(1);
    if (valueFiled.type() == newValue) {
      return map;
    } else {
      return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValue, map.isValueOptional());
    }
  }

  // Resolves a primitive: prefer the file schema's physical type when present and
  // useColumnTypeFromFileSchema is set (parquet readers need the original type);
  // fall back to the query schema's type for columns absent from the file.
  private Type buildPrimitiveType(Type.PrimitiveType typeFromQuerySchema, int currentPrimitiveTypeId) {
    Type typeFromFileSchema = fileSchema.findType(currentPrimitiveTypeId);
    if (typeFromFileSchema == null) {
      return typeFromQuerySchema;
    } else {
      return useColumnTypeFromFileSchema ? typeFromFileSchema : typeFromQuerySchema;
    }
  }
}
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.internal.schema.HoodieSchemaException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* TableChange subclasses represent requested changes to a table.
|
||||
* now only column changes support.
|
||||
* to do support partition changes
|
||||
*/
|
||||
public interface TableChange {
|
||||
/* The action Type of schema change. */
|
||||
enum ColumnChangeID {
|
||||
ADD, UPDATE, DELETE, PROPERTY_CHANGE, REPLACE;
|
||||
private String name;
|
||||
|
||||
private ColumnChangeID() {
|
||||
this.name = this.name().toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
||||
static ColumnChangeID fromValue(String value) {
|
||||
switch (value.toLowerCase(Locale.ROOT)) {
|
||||
case "add":
|
||||
return ColumnChangeID.ADD;
|
||||
case "change":
|
||||
return ColumnChangeID.UPDATE;
|
||||
case "delete":
|
||||
return ColumnChangeID.DELETE;
|
||||
case "property":
|
||||
return ColumnChangeID.PROPERTY_CHANGE;
|
||||
case "replace":
|
||||
return ColumnChangeID.REPLACE;
|
||||
default:
|
||||
throw new IllegalArgumentException("Invalid value of Type.");
|
||||
}
|
||||
}
|
||||
|
||||
ColumnChangeID columnChangeId();
|
||||
|
||||
  /**
   * Whether this change also carries column-position (reorder) information.
   * Defaults to false; position-aware change implementations override it.
   */
  default boolean withPositionChange() {
    return false;
  }
|
||||
|
||||
abstract class BaseColumnChange implements TableChange {
|
||||
protected final InternalSchema internalSchema;
|
||||
protected final Map<Integer, Integer> id2parent;
|
||||
protected final Map<Integer, ArrayList<ColumnPositionChange>> positionChangeMap = new HashMap<>();
|
||||
|
||||
BaseColumnChange(InternalSchema schema) {
|
||||
this.internalSchema = schema;
|
||||
this.id2parent = InternalSchemaBuilder.getBuilder().index2Parents(schema.getRecord());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add position change.
|
||||
*
|
||||
* @param srcName column which need to be reordered
|
||||
* @param dsrName reference position
|
||||
* @param orderType change types
|
||||
* @return this
|
||||
*/
|
||||
public BaseColumnChange addPositionChange(String srcName, String dsrName, ColumnPositionChange.ColumnPositionType orderType) {
|
||||
Integer srcId = findIdByFullName(srcName);
|
||||
Option<Integer> dsrIdOpt = dsrName.isEmpty() ? Option.empty() : Option.of(findIdByFullName(dsrName));
|
||||
Integer srcParentId = id2parent.get(srcId);
|
||||
Option<Integer> dsrParentIdOpt = dsrIdOpt.map(id2parent::get);
|
||||
// forbid adjust hoodie metadata columns.
|
||||
switch (orderType) {
|
||||
case BEFORE:
|
||||
checkColModifyIsLegal(dsrName);
|
||||
break;
|
||||
case FIRST:
|
||||
if (srcId == null || srcId == -1 || srcParentId == null || srcParentId == -1) {
|
||||
throw new HoodieSchemaException("forbid adjust top-level columns position by using through first syntax");
|
||||
}
|
||||
break;
|
||||
case AFTER:
|
||||
List<String> checkColumns = HoodieRecord.HOODIE_META_COLUMNS.subList(0, HoodieRecord.HOODIE_META_COLUMNS.size() - 2);
|
||||
if (checkColumns.stream().anyMatch(f -> f.equalsIgnoreCase(dsrName))) {
|
||||
throw new HoodieSchemaException("forbid adjust the position of ordinary columns between meta columns");
|
||||
}
|
||||
break;
|
||||
case NO_OPERATION:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
int parentId;
|
||||
if (srcParentId != null && dsrParentIdOpt.isPresent() && srcParentId.equals(dsrParentIdOpt.get())) {
|
||||
Types.Field parentField = internalSchema.findField(srcParentId);
|
||||
if (!(parentField.type() instanceof Types.RecordType)) {
|
||||
throw new HoodieSchemaException(String.format("only support reorder fields in struct type, but find: %s", parentField.type()));
|
||||
}
|
||||
parentId = parentField.fieldId();
|
||||
} else if (srcParentId == null && !dsrParentIdOpt.isPresent()) {
|
||||
parentId = -1;
|
||||
} else if (srcParentId != null && !dsrParentIdOpt.isPresent() && orderType.equals(ColumnPositionChange.ColumnPositionType.FIRST)) {
|
||||
parentId = srcParentId;
|
||||
} else {
|
||||
throw new HoodieSchemaException("cannot order position from different parent");
|
||||
}
|
||||
|
||||
ArrayList<ColumnPositionChange> changes = positionChangeMap.getOrDefault(parentId, new ArrayList<>());
|
||||
changes.add(ColumnPositionChange.get(srcId, dsrIdOpt.orElse(-1), orderType));
|
||||
positionChangeMap.put(parentId, changes);
|
||||
return this;
|
||||
}
|
||||
|
||||
public BaseColumnChange addPositionChange(String srcName, String dsrName, String orderType) {
|
||||
return addPositionChange(srcName, dsrName, ColumnPositionChange.fromTypeValue(orderType));
|
||||
}
|
||||
|
||||
/**
|
||||
* Abstract method.
|
||||
* give a column fullName and return the field id
|
||||
*
|
||||
* @param fullName column fullName
|
||||
* @return field id of current column
|
||||
*/
|
||||
protected abstract Integer findIdByFullName(String fullName);
|
||||
|
||||
// Modify hudi meta columns is prohibited
|
||||
protected void checkColModifyIsLegal(String colNeedToModfiy) {
|
||||
if (HoodieRecord.HOODIE_META_COLUMNS.stream().anyMatch(f -> f.equalsIgnoreCase(colNeedToModfiy))) {
|
||||
throw new IllegalArgumentException(String.format("cannot modify hudi meta col: %s", colNeedToModfiy));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean withPositionChange() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Column position change.
|
||||
* now support three change types: FIRST/AFTER/BEFORE
|
||||
* FIRST means the specified column should be the first column.
|
||||
* AFTER means the specified column should be put after the given column.
|
||||
* BEFORE means the specified column should be put before the given column.
|
||||
* Note that, the specified column may be a nested field:
|
||||
* AFTER/BEFORE means the given columns should in the same struct;
|
||||
* FIRST means this field should be the first one within the struct.
|
||||
*/
|
||||
class ColumnPositionChange {
|
||||
public enum ColumnPositionType {
|
||||
FIRST,
|
||||
BEFORE,
|
||||
AFTER,
|
||||
// only expose to internal use.
|
||||
NO_OPERATION
|
||||
}
|
||||
|
||||
static ColumnPositionType fromTypeValue(String value) {
|
||||
switch (value.toLowerCase(Locale.ROOT)) {
|
||||
case "first":
|
||||
return ColumnPositionType.FIRST;
|
||||
case "before":
|
||||
return ColumnPositionType.BEFORE;
|
||||
case "after":
|
||||
return ColumnPositionType.AFTER;
|
||||
case "no_operation":
|
||||
return ColumnPositionType.NO_OPERATION;
|
||||
default:
|
||||
throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", value));
|
||||
}
|
||||
}
|
||||
|
||||
private final int srcId;
|
||||
private final int dsrId;
|
||||
private final ColumnPositionType type;
|
||||
|
||||
static ColumnPositionChange first(int srcId) {
|
||||
return new ColumnPositionChange(srcId, -1, ColumnPositionType.FIRST);
|
||||
}
|
||||
|
||||
static ColumnPositionChange before(int srcId, int dsrId) {
|
||||
return new ColumnPositionChange(srcId, dsrId, ColumnPositionType.BEFORE);
|
||||
}
|
||||
|
||||
static ColumnPositionChange after(int srcId, int dsrId) {
|
||||
return new ColumnPositionChange(srcId, dsrId, ColumnPositionType.AFTER);
|
||||
}
|
||||
|
||||
static ColumnPositionChange get(int srcId, int dsrId, String type) {
|
||||
return get(srcId, dsrId, fromTypeValue(type));
|
||||
}
|
||||
|
||||
static ColumnPositionChange get(int srcId, int dsrId, ColumnPositionType type) {
|
||||
switch (type) {
|
||||
case FIRST:
|
||||
return ColumnPositionChange.first(srcId);
|
||||
case BEFORE:
|
||||
return ColumnPositionChange.before(srcId, dsrId);
|
||||
case AFTER:
|
||||
return ColumnPositionChange.after(srcId, dsrId);
|
||||
default:
|
||||
throw new IllegalArgumentException(String.format("only support first/before/after but found: %s", type));
|
||||
}
|
||||
}
|
||||
|
||||
private ColumnPositionChange(int srcId, int dsrId, ColumnPositionType type) {
|
||||
this.srcId = srcId;
|
||||
this.dsrId = dsrId;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public int getSrcId() {
|
||||
return srcId;
|
||||
}
|
||||
|
||||
public int getDsrId() {
|
||||
return dsrId;
|
||||
}
|
||||
|
||||
public ColumnPositionType type() {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.internal.schema.HoodieSchemaException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class TableChanges {
|
||||
|
||||
/** Deal with update columns changes for table. */
|
||||
public static class ColumnUpdateChange extends TableChange.BaseColumnChange {
|
||||
private final Map<Integer, Types.Field> updates = new HashMap<>();
|
||||
|
||||
public static ColumnUpdateChange get(InternalSchema schema) {
|
||||
return new ColumnUpdateChange(schema);
|
||||
}
|
||||
|
||||
private ColumnUpdateChange(InternalSchema schema) {
|
||||
super(schema);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean withPositionChange() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public Type applyUpdates(Types.Field oldField, Type type) {
|
||||
Types.Field update = updates.get(oldField.fieldId());
|
||||
if (update != null && update.type() != oldField.type()) {
|
||||
return update.type();
|
||||
}
|
||||
//
|
||||
ArrayList<ColumnPositionChange> pchanges = positionChangeMap.getOrDefault(oldField.fieldId(), new ArrayList<>());
|
||||
if (!pchanges.isEmpty()) {
|
||||
// when we build ColumnAddChange,we have already done some check, so it's safe to convert newType to RecordType
|
||||
List<Types.Field> newFields = TableChangesHelper.applyAddChange2Fields(((Types.RecordType) type).fields(), new ArrayList<>(), pchanges);
|
||||
return Types.RecordType.get(newFields);
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
public Map<Integer, Types.Field> getUpdates() {
|
||||
return updates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a column in the schema to a new type.
|
||||
* only support update primitive type.
|
||||
* Only updates that widen types are allowed.
|
||||
*
|
||||
* @param name name of the column to update
|
||||
* @param newType new type for the column
|
||||
* @return this
|
||||
* @throws IllegalArgumentException
|
||||
*/
|
||||
public ColumnUpdateChange updateColumnType(String name, Type newType) {
|
||||
checkColModifyIsLegal(name);
|
||||
if (newType.isNestedType()) {
|
||||
throw new IllegalArgumentException(String.format("only support update primitive type but find nest column: %s", name));
|
||||
}
|
||||
Types.Field field = internalSchema.findField(name);
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name));
|
||||
}
|
||||
|
||||
if (!SchemaChangeUtils.isTypeUpdateAllow(field.type(), newType)) {
|
||||
throw new IllegalArgumentException(String.format("cannot update origin type: %s to a incompatibility type: %s", field.type(), newType));
|
||||
}
|
||||
|
||||
if (field.type().equals(newType)) {
|
||||
// do nothings
|
||||
return this;
|
||||
}
|
||||
// save update info
|
||||
Types.Field update = updates.get(field.fieldId());
|
||||
if (update == null) {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), field.name(), newType, field.doc()));
|
||||
} else {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), update.name(), newType, update.doc()));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a column doc in the schema to a new primitive type.
|
||||
*
|
||||
* @param name name of the column to update
|
||||
* @param newDoc new documentation for the column
|
||||
* @return this
|
||||
* @throws IllegalArgumentException
|
||||
*/
|
||||
public ColumnUpdateChange updateColumnComment(String name, String newDoc) {
|
||||
checkColModifyIsLegal(name);
|
||||
Types.Field field = internalSchema.findField(name);
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name));
|
||||
}
|
||||
// consider null
|
||||
if (Objects.equals(field.doc(), newDoc)) {
|
||||
// do nothings
|
||||
return this;
|
||||
}
|
||||
// save update info
|
||||
Types.Field update = updates.get(field.fieldId());
|
||||
if (update == null) {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), field.name(), field.type(), newDoc));
|
||||
} else {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), update.name(), update.type(), newDoc));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rename a column in the schema.
|
||||
*
|
||||
* @param name name of the column to rename
|
||||
* @param newName new name for the column
|
||||
* @return this
|
||||
* @throws IllegalArgumentException
|
||||
*/
|
||||
public ColumnUpdateChange renameColumn(String name, String newName) {
|
||||
checkColModifyIsLegal(name);
|
||||
Types.Field field = internalSchema.findField(name);
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name));
|
||||
}
|
||||
if (newName == null || newName.isEmpty()) {
|
||||
throw new IllegalArgumentException(String.format("cannot rename column: %s to empty", name));
|
||||
}
|
||||
// keep consisitent with hive. column names insensitive, so we check 'newName.toLowerCase(Locale.ROOT)'
|
||||
if (internalSchema.findDuplicateCol(newName.toLowerCase(Locale.ROOT))) {
|
||||
throw new IllegalArgumentException(String.format("cannot rename column: %s to a existing name", name));
|
||||
}
|
||||
// save update info
|
||||
Types.Field update = updates.get(field.fieldId());
|
||||
if (update == null) {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), field.isOptional(), newName, field.type(), field.doc()));
|
||||
} else {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), update.isOptional(), newName, update.type(), update.doc()));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update nullable for column.
|
||||
* only support required type -> optional type
|
||||
*
|
||||
* @param name name of the column to update
|
||||
* @param nullable nullable for updated name
|
||||
* @return this
|
||||
* @throws IllegalArgumentException
|
||||
*/
|
||||
public ColumnUpdateChange updateColumnNullability(String name, boolean nullable) {
|
||||
return updateColumnNullability(name, nullable, false);
|
||||
}
|
||||
|
||||
public ColumnUpdateChange updateColumnNullability(String name, boolean nullable, boolean force) {
|
||||
checkColModifyIsLegal(name);
|
||||
Types.Field field = internalSchema.findField(name);
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException(String.format("cannot update a missing column: %s", name));
|
||||
}
|
||||
if (field.isOptional() == nullable) {
|
||||
// do nothings
|
||||
return this;
|
||||
}
|
||||
if (field.isOptional() && !nullable && !force) {
|
||||
throw new IllegalArgumentException("cannot update column Nullability: optional to required");
|
||||
}
|
||||
// save update info
|
||||
Types.Field update = updates.get(field.fieldId());
|
||||
if (update == null) {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), nullable, field.name(), field.type(), field.doc()));
|
||||
} else {
|
||||
updates.put(field.fieldId(), Types.Field.get(field.fieldId(), nullable, update.name(), update.type(), update.doc()));
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public Map<Integer, ArrayList<ColumnPositionChange>> getPositionChangeMap() {
|
||||
return positionChangeMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnChangeID columnChangeId() {
|
||||
return ColumnChangeID.UPDATE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Integer findIdByFullName(String fullName) {
|
||||
Types.Field field = internalSchema.findField(fullName);
|
||||
if (field != null) {
|
||||
return field.fieldId();
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format("cannot find col id for given column fullName: %s", fullName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Deal with delete columns changes for table. */
|
||||
public static class ColumnDeleteChange extends TableChange.BaseColumnChange {
|
||||
private final Set deletes = new HashSet<>();
|
||||
|
||||
@Override
|
||||
public ColumnChangeID columnChangeId() {
|
||||
return ColumnChangeID.DELETE;
|
||||
}
|
||||
|
||||
public static ColumnDeleteChange get(InternalSchema schema) {
|
||||
return new ColumnDeleteChange(schema);
|
||||
}
|
||||
|
||||
private ColumnDeleteChange(InternalSchema schema) {
|
||||
super(schema);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean withPositionChange() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BaseColumnChange addPositionChange(String srcId, String dsrId, String orderType) {
|
||||
throw new UnsupportedOperationException("no support add position change for ColumnDeleteChange");
|
||||
}
|
||||
|
||||
public ColumnDeleteChange deleteColumn(String name) {
|
||||
checkColModifyIsLegal(name);
|
||||
Types.Field field = internalSchema.findField(name);
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException(String.format("cannot delete missing columns: %s", name));
|
||||
}
|
||||
deletes.add(field.fieldId());
|
||||
return this;
|
||||
}
|
||||
|
||||
public Type applyDelete(int id, Type type) {
|
||||
if (deletes.contains(id)) {
|
||||
return null;
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
public Set<Integer> getDeletes() {
|
||||
return deletes;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Integer findIdByFullName(String fullName) {
|
||||
throw new UnsupportedOperationException("delete change cannot support this method");
|
||||
}
|
||||
}
|
||||
|
||||
/** Deal with add columns changes for table. */
|
||||
public static class ColumnAddChange extends TableChange.BaseColumnChange {
|
||||
private final Map<String, Integer> fullColName2Id = new HashMap<>();
|
||||
private final Map<Integer, ArrayList<Types.Field>> parentId2AddCols = new HashMap<>();
|
||||
private int nextId;
|
||||
|
||||
public static ColumnAddChange get(InternalSchema internalSchema) {
|
||||
return new ColumnAddChange(internalSchema);
|
||||
}
|
||||
|
||||
public Type applyAdd(Types.Field orignalField, Type type) {
|
||||
int fieldId = orignalField.fieldId();
|
||||
ArrayList<Types.Field> addFields = parentId2AddCols.getOrDefault(fieldId, new ArrayList<>());
|
||||
ArrayList<ColumnPositionChange> pchanges = positionChangeMap.getOrDefault(fieldId, new ArrayList<>());
|
||||
|
||||
if (!addFields.isEmpty() || !pchanges.isEmpty()) {
|
||||
// when we build ColumnAddChange,we have already done some check, so it's safe to convert newType to RecordType
|
||||
List<Types.Field> newFields = TableChangesHelper.applyAddChange2Fields(((Types.RecordType) type).fields(), addFields, pchanges);
|
||||
return Types.RecordType.get(newFields);
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
public ColumnAddChange addColumns(String name, Type type, String doc) {
|
||||
checkColModifyIsLegal(name);
|
||||
return addColumns("", name, type, doc);
|
||||
}
|
||||
|
||||
public ColumnAddChange addColumns(String parent, String name, Type type, String doc) {
|
||||
checkColModifyIsLegal(name);
|
||||
addColumnsInternal(parent, name, type, doc);
|
||||
return this;
|
||||
}
|
||||
|
||||
private void addColumnsInternal(String parent, String name, Type type, String doc) {
|
||||
// root record has no parent, so set parentId to -1 as default
|
||||
int parentId = -1;
|
||||
// do check
|
||||
String fullName = name;
|
||||
if (!parent.isEmpty()) {
|
||||
Types.Field parentField = internalSchema.findField(parent);
|
||||
if (parentField == null) {
|
||||
throw new HoodieSchemaException(String.format("cannot add column: %s which parent: %s is not exist", name, parent));
|
||||
}
|
||||
Type parentType = parentField.type();
|
||||
if (!(parentField.type() instanceof Types.RecordType)) {
|
||||
throw new HoodieSchemaException("only support add nested columns to struct column");
|
||||
}
|
||||
parentId = parentField.fieldId();
|
||||
Types.Field newParentField = internalSchema.findField(parent + "." + name);
|
||||
if (newParentField != null) {
|
||||
throw new HoodieSchemaException(String.format("cannot add column: %s which already exist", name));
|
||||
}
|
||||
fullName = parent + "." + name;
|
||||
} else {
|
||||
// keep consistent with hive, column name case insensitive
|
||||
if (internalSchema.findDuplicateCol(name.toLowerCase(Locale.ROOT))) {
|
||||
throw new HoodieSchemaException(String.format("cannot add column: %s which already exist", name));
|
||||
}
|
||||
}
|
||||
if (fullColName2Id.containsKey(fullName)) {
|
||||
throw new HoodieSchemaException(String.format("cannot repeat add column: %s", name));
|
||||
}
|
||||
fullColName2Id.put(fullName, nextId);
|
||||
if (parentId != -1) {
|
||||
id2parent.put(nextId, parentId);
|
||||
}
|
||||
AtomicInteger assignNextId = new AtomicInteger(nextId + 1);
|
||||
Type typeWithNewId = InternalSchemaBuilder.getBuilder().refreshNewId(type, assignNextId);
|
||||
// only allow add optional columns.
|
||||
ArrayList<Types.Field> adds = parentId2AddCols.getOrDefault(parentId, new ArrayList<>());
|
||||
adds.add(Types.Field.get(nextId, true, name, typeWithNewId, doc));
|
||||
parentId2AddCols.put(parentId, adds);
|
||||
nextId = assignNextId.get();
|
||||
}
|
||||
|
||||
private ColumnAddChange(InternalSchema internalSchema) {
|
||||
super(internalSchema);
|
||||
this.nextId = internalSchema.getMaxColumnId() + 1;
|
||||
}
|
||||
|
||||
public Map<Integer, ArrayList<Types.Field>> getParentId2AddCols() {
|
||||
return parentId2AddCols;
|
||||
}
|
||||
|
||||
public Map<Integer, ArrayList<ColumnPositionChange>> getPositionChangeMap() {
|
||||
return positionChangeMap;
|
||||
}
|
||||
|
||||
// expose to test
|
||||
public Map<String, Integer> getFullColName2Id() {
|
||||
return fullColName2Id;
|
||||
}
|
||||
|
||||
protected Integer findIdByFullName(String fullName) {
|
||||
Types.Field field = internalSchema.findField(fullName);
|
||||
if (field != null) {
|
||||
return field.fieldId();
|
||||
}
|
||||
return fullColName2Id.getOrDefault(fullName, -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnChangeID columnChangeId() {
|
||||
return ColumnChangeID.ADD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean withPositionChange() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helper class to support Table schema changes.
|
||||
*/
|
||||
public class TableChangesHelper {
|
||||
/**
|
||||
* Apply add operation and column position change operation.
|
||||
*
|
||||
* @param fields origin column fields.
|
||||
* @param adds column fields to be added.
|
||||
* @param pchanges a wrapper class hold all the position change operations.
|
||||
* @return column fields after adjusting the position.
|
||||
*/
|
||||
public static List<Types.Field> applyAddChange2Fields(List<Types.Field> fields, ArrayList<Types.Field> adds, ArrayList<TableChange.ColumnPositionChange> pchanges) {
|
||||
if (adds == null && pchanges == null) {
|
||||
return fields;
|
||||
}
|
||||
LinkedList<Types.Field> result = new LinkedList<>(fields);
|
||||
// apply add columns
|
||||
if (adds != null && !adds.isEmpty()) {
|
||||
result.addAll(adds);
|
||||
}
|
||||
// apply position change
|
||||
if (pchanges != null && !pchanges.isEmpty()) {
|
||||
for (TableChange.ColumnPositionChange pchange : pchanges) {
|
||||
Types.Field srcField = result.stream().filter(f -> f.fieldId() == pchange.getSrcId()).findFirst().get();
|
||||
Types.Field dsrField = result.stream().filter(f -> f.fieldId() == pchange.getDsrId()).findFirst().orElse(null);
|
||||
// we remove srcField first
|
||||
result.remove(srcField);
|
||||
switch (pchange.type()) {
|
||||
case AFTER:
|
||||
// add srcField after dsrField
|
||||
result.add(result.indexOf(dsrField) + 1, srcField);
|
||||
break;
|
||||
case BEFORE:
|
||||
// add srcField before dsrField
|
||||
result.add(result.indexOf(dsrField), srcField);
|
||||
break;
|
||||
case FIRST:
|
||||
result.addFirst(srcField);
|
||||
break;
|
||||
default:
|
||||
// should not reach here
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static String getParentName(String fullColName) {
|
||||
int offset = fullColName.lastIndexOf(".");
|
||||
return offset > 0 ? fullColName.substring(0, offset) : "";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,436 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.convert;
|
||||
|
||||
import org.apache.avro.JsonProperties;
|
||||
import org.apache.avro.LogicalType;
|
||||
import org.apache.avro.LogicalTypes;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hudi.internal.schema.HoodieSchemaException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static org.apache.avro.Schema.Type.UNION;
|
||||
|
||||
/**
|
||||
* Auxiliary class.
|
||||
* Converts an avro schema into InternalSchema, or convert InternalSchema to an avro schema
|
||||
*/
|
||||
public class AvroInternalSchemaConverter {
|
||||
|
||||
/**
|
||||
* Convert internalSchema to avro Schema.
|
||||
*
|
||||
* @param internalSchema internal schema.
|
||||
* @param tableName the record name.
|
||||
* @return an avro Schema.
|
||||
*/
|
||||
  public static Schema convert(InternalSchema internalSchema, String tableName) {
    // delegate to the recursive builder; tableName becomes the name of the resulting Avro record
    return buildAvroSchemaFromInternalSchema(internalSchema, tableName);
  }
|
||||
|
||||
/**
|
||||
* Convert RecordType to avro Schema.
|
||||
*
|
||||
* @param type internal schema.
|
||||
* @param name the record name.
|
||||
* @return an avro Schema.
|
||||
*/
|
||||
  public static Schema convert(Types.RecordType type, String name) {
    // delegate to the type-level builder; name becomes the name of the resulting Avro record
    return buildAvroSchemaFromType(type, name);
  }
|
||||
|
||||
/**
|
||||
* Convert internal type to avro Schema.
|
||||
*
|
||||
* @param type internal type.
|
||||
* @param name the record name.
|
||||
* @return an avro Schema.
|
||||
*/
|
||||
  public static Schema convert(Type type, String name) {
    // same builder as the RecordType overload, but accepts any internal Type
    return buildAvroSchemaFromType(type, name);
  }
|
||||
|
||||
/** Convert an avro schema into internal type. */
|
||||
  public static Type convertToField(Schema schema) {
    // assigns fresh field ids while converting; see buildTypeFromAvroSchema for the id scheme
    return buildTypeFromAvroSchema(schema);
  }
|
||||
|
||||
/** Convert an avro schema into internalSchema. */
|
||||
  public static InternalSchema convert(Schema schema) {
    // NOTE(review): the cast assumes the input is a RECORD schema; a non-record Avro schema
    // would fail here with a ClassCastException — confirm callers only pass record schemas.
    List<Types.Field> fields = ((Types.RecordType) convertToField(schema)).fields();
    return new InternalSchema(fields);
  }
|
||||
|
||||
/** Check whether current avro schema is optional?. */
|
||||
public static boolean isOptional(Schema schema) {
|
||||
if (schema.getType() == UNION && schema.getTypes().size() == 2) {
|
||||
return schema.getTypes().get(0).getType() == Schema.Type.NULL || schema.getTypes().get(1).getType() == Schema.Type.NULL;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns schema with nullable true. */
|
||||
public static Schema nullableSchema(Schema schema) {
|
||||
if (schema.getType() == UNION) {
|
||||
if (!isOptional(schema)) {
|
||||
throw new HoodieSchemaException(String.format("Union schemas are not supported: %s", schema));
|
||||
}
|
||||
return schema;
|
||||
} else {
|
||||
return Schema.createUnion(Schema.create(Schema.Type.NULL), schema);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build hudi type from avro schema.
|
||||
*
|
||||
* @param schema a avro schema.
|
||||
* @return a hudi type.
|
||||
*/
|
||||
public static Type buildTypeFromAvroSchema(Schema schema) {
|
||||
// set flag to check this has not been visited.
|
||||
Deque<String> visited = new LinkedList();
|
||||
AtomicInteger nextId = new AtomicInteger(1);
|
||||
return visitAvroSchemaToBuildType(schema, visited, true, nextId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an avro schema into hudi type.
|
||||
*
|
||||
* @param schema a avro schema.
|
||||
* @param visited track the visit node when do traversal for avro schema; used to check if the name of avro record schema is correct.
|
||||
* @param firstVisitRoot track whether the current visited schema node is a root node.
|
||||
* @param nextId a initial id which used to create id for all fields.
|
||||
* @return a hudi type match avro schema.
|
||||
*/
|
||||
  private static Type visitAvroSchemaToBuildType(Schema schema, Deque<String> visited, Boolean firstVisitRoot, AtomicInteger nextId) {
    switch (schema.getType()) {
      case RECORD:
        String name = schema.getFullName();
        // recursive record definitions cannot be represented with fixed field ids
        if (visited.contains(name)) {
          throw new HoodieSchemaException(String.format("cannot convert recursive avro record %s", name));
        }
        visited.push(name);
        List<Schema.Field> fields = schema.getFields();
        List<Type> fieldTypes = new ArrayList<>(fields.size());
        // reserve a contiguous id range [nextAssignId, nextAssignId + fields.size()) for this
        // record's direct fields BEFORE recursing, so nested fields receive ids after them
        int nextAssignId = nextId.get();
        // when first visit root record, set nextAssignId = 0;
        if (firstVisitRoot) {
          nextAssignId = 0;
        }
        nextId.set(nextAssignId + fields.size());
        fields.stream().forEach(field -> {
          fieldTypes.add(visitAvroSchemaToBuildType(field.schema(), visited, false, nextId));
        });
        visited.pop();
        List<Types.Field> internalFields = new ArrayList<>(fields.size());

        // assemble the fields with their reserved, sequential ids
        for (int i = 0; i < fields.size(); i++) {
          Schema.Field field = fields.get(i);
          Type fieldType = fieldTypes.get(i);
          internalFields.add(Types.Field.get(nextAssignId, AvroInternalSchemaConverter.isOptional(field.schema()), field.name(), fieldType, field.doc()));
          nextAssignId += 1;
        }
        return Types.RecordType.get(internalFields);
      case UNION:
        // pick the non-null branch of an optional union.
        // NOTE(review): assumes visiting a NULL branch yields null — the primitive visitor's
        // NULL handling is not visible here; confirm before relying on >2-branch unions.
        List<Type> fTypes = new ArrayList<>();
        schema.getTypes().stream().forEach(t -> {
          fTypes.add(visitAvroSchemaToBuildType(t, visited, false, nextId));
        });
        return fTypes.get(0) == null ? fTypes.get(1) : fTypes.get(0);
      case ARRAY:
        Schema elementSchema = schema.getElementType();
        // reserve one id for the array's element pseudo-field before recursing into it
        int elementId = nextId.get();
        nextId.set(elementId + 1);
        Type elementType = visitAvroSchemaToBuildType(elementSchema, visited, false, nextId);
        return Types.ArrayType.get(elementId, AvroInternalSchemaConverter.isOptional(schema.getElementType()), elementType);
      case MAP:
        // reserve two ids: one for the key pseudo-field, one for the value pseudo-field
        int keyId = nextId.get();
        int valueId = keyId + 1;
        nextId.set(valueId + 1);
        Type valueType = visitAvroSchemaToBuildType(schema.getValueType(), visited, false, nextId);
        // Avro map keys are always strings
        return Types.MapType.get(keyId, valueId, Types.StringType.get(), valueType, AvroInternalSchemaConverter.isOptional(schema.getValueType()));
      default:
        return visitAvroPrimitiveToBuildInternalType(schema);
    }
  }
|
||||
|
||||
private static Type visitAvroPrimitiveToBuildInternalType(Schema primitive) {
|
||||
LogicalType logical = primitive.getLogicalType();
|
||||
if (logical != null) {
|
||||
String name = logical.getName();
|
||||
if (logical instanceof LogicalTypes.Decimal) {
|
||||
return Types.DecimalType.get(
|
||||
((LogicalTypes.Decimal) logical).getPrecision(),
|
||||
((LogicalTypes.Decimal) logical).getScale());
|
||||
|
||||
} else if (logical instanceof LogicalTypes.Date) {
|
||||
return Types.DateType.get();
|
||||
|
||||
} else if (
|
||||
logical instanceof LogicalTypes.TimeMillis
|
||||
|| logical instanceof LogicalTypes.TimeMicros) {
|
||||
return Types.TimeType.get();
|
||||
|
||||
} else if (
|
||||
logical instanceof LogicalTypes.TimestampMillis
|
||||
|| logical instanceof LogicalTypes.TimestampMicros) {
|
||||
return Types.TimestampType.get();
|
||||
} else if (LogicalTypes.uuid().getName().equals(name)) {
|
||||
return Types.UUIDType.get();
|
||||
}
|
||||
}
|
||||
|
||||
switch (primitive.getType()) {
|
||||
case BOOLEAN:
|
||||
return Types.BooleanType.get();
|
||||
case INT:
|
||||
return Types.IntType.get();
|
||||
case LONG:
|
||||
return Types.LongType.get();
|
||||
case FLOAT:
|
||||
return Types.FloatType.get();
|
||||
case DOUBLE:
|
||||
return Types.DoubleType.get();
|
||||
case STRING:
|
||||
case ENUM:
|
||||
return Types.StringType.get();
|
||||
case FIXED:
|
||||
return Types.FixedType.getFixed(primitive.getFixedSize());
|
||||
case BYTES:
|
||||
return Types.BinaryType.get();
|
||||
case NULL:
|
||||
return null;
|
||||
default:
|
||||
throw new UnsupportedOperationException("Unsupported primitive type: " + primitive);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi type into an Avro Schema.
|
||||
*
|
||||
* @param type a hudi type.
|
||||
* @param recordName the record name
|
||||
* @return a Avro schema match this type
|
||||
*/
|
||||
public static Schema buildAvroSchemaFromType(Type type, String recordName) {
|
||||
Map<Type, Schema> cache = new HashMap<>();
|
||||
return visitInternalSchemaToBuildAvroSchema(type, cache, recordName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi internal Schema into an Avro Schema.
|
||||
*
|
||||
* @param schema a hudi internal Schema.
|
||||
* @param recordName the record name
|
||||
* @return a Avro schema match hudi internal schema.
|
||||
*/
|
||||
public static Schema buildAvroSchemaFromInternalSchema(InternalSchema schema, String recordName) {
|
||||
Map<Type, Schema> cache = new HashMap<>();
|
||||
return visitInternalSchemaToBuildAvroSchema(schema.getRecord(), cache, recordName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi type into an Avro Schema.
|
||||
*
|
||||
* @param type a hudi type.
|
||||
* @param cache use to cache intermediate convert result to save cost.
|
||||
* @param recordName the record name
|
||||
* @return a Avro schema match this type
|
||||
*/
|
||||
private static Schema visitInternalSchemaToBuildAvroSchema(Type type, Map<Type, Schema> cache, String recordName) {
|
||||
switch (type.typeId()) {
|
||||
case RECORD:
|
||||
Types.RecordType record = (Types.RecordType) type;
|
||||
List<Schema> schemas = new ArrayList<>();
|
||||
record.fields().forEach(f -> {
|
||||
Schema tempSchema = visitInternalSchemaToBuildAvroSchema(f.type(), cache, recordName + "_" + f.name());
|
||||
// convert tempSchema
|
||||
Schema result = f.isOptional() ? AvroInternalSchemaConverter.nullableSchema(tempSchema) : tempSchema;
|
||||
schemas.add(result);
|
||||
});
|
||||
// check visited
|
||||
Schema recordSchema;
|
||||
recordSchema = cache.get(record);
|
||||
if (recordSchema != null) {
|
||||
return recordSchema;
|
||||
}
|
||||
recordSchema = visitInternalRecordToBuildAvroRecord(record, schemas, recordName);
|
||||
cache.put(record, recordSchema);
|
||||
return recordSchema;
|
||||
case ARRAY:
|
||||
Types.ArrayType array = (Types.ArrayType) type;
|
||||
Schema elementSchema;
|
||||
elementSchema = visitInternalSchemaToBuildAvroSchema(array.elementType(), cache, recordName);
|
||||
Schema arraySchema;
|
||||
arraySchema = cache.get(array);
|
||||
if (arraySchema != null) {
|
||||
return arraySchema;
|
||||
}
|
||||
arraySchema = visitInternalArrayToBuildAvroArray(array, elementSchema);
|
||||
cache.put(array, arraySchema);
|
||||
return arraySchema;
|
||||
case MAP:
|
||||
Types.MapType map = (Types.MapType) type;
|
||||
Schema keySchema;
|
||||
Schema valueSchema;
|
||||
keySchema = visitInternalSchemaToBuildAvroSchema(map.keyType(), cache, recordName);
|
||||
valueSchema = visitInternalSchemaToBuildAvroSchema(map.valueType(), cache, recordName);
|
||||
Schema mapSchema;
|
||||
mapSchema = cache.get(map);
|
||||
if (mapSchema != null) {
|
||||
return mapSchema;
|
||||
}
|
||||
mapSchema = visitInternalMapToBuildAvroMap(map, keySchema, valueSchema);
|
||||
cache.put(map, mapSchema);
|
||||
return mapSchema;
|
||||
default:
|
||||
Schema primitiveSchema = visitInternalPrimitiveToBuildAvroPrimitiveType((Type.PrimitiveType) type);
|
||||
cache.put(type, primitiveSchema);
|
||||
return primitiveSchema;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi RecordType to Avro RecordType.
|
||||
* this is auxiliary function used by visitInternalSchemaToBuildAvroSchema
|
||||
*/
|
||||
private static Schema visitInternalRecordToBuildAvroRecord(Types.RecordType record, List<Schema> fieldSchemas, String recordName) {
|
||||
List<Types.Field> fields = record.fields();
|
||||
List<Schema.Field> avroFields = new ArrayList<>();
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
Types.Field f = fields.get(i);
|
||||
Schema.Field field = new Schema.Field(f.name(), fieldSchemas.get(i), f.doc(), f.isOptional() ? JsonProperties.NULL_VALUE : null);
|
||||
avroFields.add(field);
|
||||
}
|
||||
return Schema.createRecord(recordName, null, null, false, avroFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi ArrayType to Avro ArrayType.
|
||||
* this is auxiliary function used by visitInternalSchemaToBuildAvroSchema
|
||||
*/
|
||||
private static Schema visitInternalArrayToBuildAvroArray(Types.ArrayType array, Schema elementSchema) {
|
||||
Schema result;
|
||||
if (array.isElementOptional()) {
|
||||
result = Schema.createArray(AvroInternalSchemaConverter.nullableSchema(elementSchema));
|
||||
} else {
|
||||
result = Schema.createArray(elementSchema);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi MapType to Avro MapType.
|
||||
* this is auxiliary function used by visitInternalSchemaToBuildAvroSchema
|
||||
*/
|
||||
private static Schema visitInternalMapToBuildAvroMap(Types.MapType map, Schema keySchema, Schema valueSchema) {
|
||||
Schema mapSchema;
|
||||
if (keySchema.getType() == Schema.Type.STRING) {
|
||||
mapSchema = Schema.createMap(map.isValueOptional() ? AvroInternalSchemaConverter.nullableSchema(valueSchema) : valueSchema);
|
||||
} else {
|
||||
throw new HoodieSchemaException("only support StringType key for avro MapType");
|
||||
}
|
||||
return mapSchema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts hudi PrimitiveType to Avro PrimitiveType.
|
||||
* this is auxiliary function used by visitInternalSchemaToBuildAvroSchema
|
||||
*/
|
||||
private static Schema visitInternalPrimitiveToBuildAvroPrimitiveType(Type.PrimitiveType primitive) {
|
||||
Schema primitiveSchema;
|
||||
switch (primitive.typeId()) {
|
||||
case BOOLEAN:
|
||||
primitiveSchema = Schema.create(Schema.Type.BOOLEAN);
|
||||
break;
|
||||
case INT:
|
||||
primitiveSchema = Schema.create(Schema.Type.INT);
|
||||
break;
|
||||
case LONG:
|
||||
primitiveSchema = Schema.create(Schema.Type.LONG);
|
||||
break;
|
||||
case FLOAT:
|
||||
primitiveSchema = Schema.create(Schema.Type.FLOAT);
|
||||
break;
|
||||
case DOUBLE:
|
||||
primitiveSchema = Schema.create(Schema.Type.DOUBLE);
|
||||
break;
|
||||
case DATE:
|
||||
primitiveSchema = LogicalTypes.date()
|
||||
.addToSchema(Schema.create(Schema.Type.INT));
|
||||
break;
|
||||
case TIME:
|
||||
primitiveSchema = LogicalTypes.timeMicros()
|
||||
.addToSchema(Schema.create(Schema.Type.LONG));
|
||||
break;
|
||||
case TIMESTAMP:
|
||||
primitiveSchema = LogicalTypes.timestampMicros()
|
||||
.addToSchema(Schema.create(Schema.Type.LONG));
|
||||
break;
|
||||
case STRING:
|
||||
primitiveSchema = Schema.create(Schema.Type.STRING);
|
||||
break;
|
||||
case UUID:
|
||||
primitiveSchema = LogicalTypes.uuid()
|
||||
.addToSchema(Schema.createFixed("uuid_fixed", null, null, 16));
|
||||
break;
|
||||
case FIXED:
|
||||
Types.FixedType fixed = (Types.FixedType) primitive;
|
||||
primitiveSchema = Schema.createFixed("fixed_" + fixed.getFixedSize(), null, null, fixed.getFixedSize());
|
||||
break;
|
||||
case BINARY:
|
||||
primitiveSchema = Schema.create(Schema.Type.BYTES);
|
||||
break;
|
||||
case DECIMAL:
|
||||
Types.DecimalType decimal = (Types.DecimalType) primitive;
|
||||
primitiveSchema = LogicalTypes.decimal(decimal.precision(), decimal.scale())
|
||||
.addToSchema(Schema.createFixed(
|
||||
"decimal_" + decimal.precision() + "_" + decimal.scale(),
|
||||
null, null, computeMinBytesForPrecision(decimal.precision())));
|
||||
break;
|
||||
default:
|
||||
throw new UnsupportedOperationException(
|
||||
"Unsupported type ID: " + primitive.typeId());
|
||||
}
|
||||
return primitiveSchema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the minimum number of bytes needed to store a decimal with a give 'precision'.
|
||||
* reference from Spark release 3.1 .
|
||||
*/
|
||||
private static int computeMinBytesForPrecision(int precision) {
|
||||
int numBytes = 1;
|
||||
while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) {
|
||||
numBytes += 1;
|
||||
}
|
||||
return numBytes;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.io;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
import org.apache.hudi.internal.schema.InternalSchema;

import java.util.List;
|
||||
|
||||
abstract class AbstractInternalSchemaStorageManager {
|
||||
|
||||
/**
|
||||
* Persist history schema str.
|
||||
*/
|
||||
public abstract void persistHistorySchemaStr(String instantTime, String historySchemaStr);
|
||||
|
||||
/**
|
||||
* Get latest history schema string.
|
||||
*/
|
||||
public abstract String getHistorySchemaStr();
|
||||
|
||||
/**
|
||||
* Get latest history schema string.
|
||||
* Using give validCommits to validate all legal histroy Schema files, and return the latest one.
|
||||
* If the passed valid commits is null or empty, valid instants will be fetched from the file-system and used.
|
||||
*/
|
||||
public abstract String getHistorySchemaStrByGivenValidCommits(List<String> validCommits);
|
||||
|
||||
/**
|
||||
* Get internalSchema by using given versionId
|
||||
*
|
||||
* @param versionId schema version_id need to search
|
||||
* @return internalSchema
|
||||
*/
|
||||
public abstract Option getSchemaByKey(String versionId);
|
||||
}
|
||||
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.io;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
|
||||
import org.apache.hudi.internal.schema.utils.SerDeHelper;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.common.table.timeline.HoodieTimeline.SCHEMA_COMMIT_ACTION;
|
||||
|
||||
public class FileBasedInternalSchemaStorageManager extends AbstractInternalSchemaStorageManager {
|
||||
private static final Logger LOG = LogManager.getLogger(FileBasedInternalSchemaStorageManager.class);
|
||||
|
||||
public static final String SCHEMA_NAME = ".schema";
|
||||
private final Path baseSchemaPath;
|
||||
private final Configuration conf;
|
||||
private HoodieTableMetaClient metaClient;
|
||||
|
||||
public FileBasedInternalSchemaStorageManager(Configuration conf, Path baseTablePath) {
|
||||
Path metaPath = new Path(baseTablePath, ".hoodie");
|
||||
this.baseSchemaPath = new Path(metaPath, SCHEMA_NAME);
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
public FileBasedInternalSchemaStorageManager(HoodieTableMetaClient metaClient) {
|
||||
Path metaPath = new Path(metaClient.getBasePath(), ".hoodie");
|
||||
this.baseSchemaPath = new Path(metaPath, SCHEMA_NAME);
|
||||
this.conf = metaClient.getHadoopConf();
|
||||
this.metaClient = metaClient;
|
||||
}
|
||||
|
||||
// make metaClient build lazy
|
||||
private HoodieTableMetaClient getMetaClient() {
|
||||
if (metaClient == null) {
|
||||
metaClient = HoodieTableMetaClient.builder().setBasePath(baseSchemaPath.getParent().getParent().toString()).setConf(conf).build();
|
||||
}
|
||||
return metaClient;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void persistHistorySchemaStr(String instantTime, String historySchemaStr) {
|
||||
cleanResidualFiles();
|
||||
HoodieActiveTimeline timeline = getMetaClient().getActiveTimeline();
|
||||
HoodieInstant hoodieInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, SCHEMA_COMMIT_ACTION, instantTime);
|
||||
timeline.createNewInstant(hoodieInstant);
|
||||
byte[] writeContent = historySchemaStr.getBytes(StandardCharsets.UTF_8);
|
||||
timeline.transitionRequestedToInflight(hoodieInstant, Option.empty());
|
||||
timeline.saveAsComplete(new HoodieInstant(HoodieInstant.State.INFLIGHT, hoodieInstant.getAction(), hoodieInstant.getTimestamp()), Option.of(writeContent));
|
||||
LOG.info(String.format("persist history schema success on commit time: %s", instantTime));
|
||||
}
|
||||
|
||||
private void cleanResidualFiles() {
|
||||
List<String> validateCommits = getValidInstants();
|
||||
try {
|
||||
FileSystem fs = baseSchemaPath.getFileSystem(conf);
|
||||
if (fs.exists(baseSchemaPath)) {
|
||||
List<String> candidateSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath)).filter(f -> f.isFile())
|
||||
.map(file -> file.getPath().getName()).collect(Collectors.toList());
|
||||
List<String> residualSchemaFiles = candidateSchemaFiles.stream().filter(f -> !validateCommits.contains(f.split("\\.")[0])).collect(Collectors.toList());
|
||||
// clean residual files
|
||||
residualSchemaFiles.forEach(f -> {
|
||||
try {
|
||||
fs.delete(new Path(getMetaClient().getSchemaFolderName(), f));
|
||||
} catch (IOException o) {
|
||||
throw new HoodieException(o);
|
||||
}
|
||||
});
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void cleanOldFiles(List<String> validateCommits) {
|
||||
try {
|
||||
FileSystem fs = baseSchemaPath.getFileSystem(conf);
|
||||
if (fs.exists(baseSchemaPath)) {
|
||||
List<String> candidateSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath)).filter(f -> f.isFile())
|
||||
.map(file -> file.getPath().getName()).collect(Collectors.toList());
|
||||
List<String> validateSchemaFiles = candidateSchemaFiles.stream().filter(f -> validateCommits.contains(f.split("\\.")[0])).collect(Collectors.toList());
|
||||
for (int i = 0; i < validateSchemaFiles.size(); i++) {
|
||||
fs.delete(new Path(validateSchemaFiles.get(i)));
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getValidInstants() {
|
||||
return getMetaClient().getCommitsTimeline()
|
||||
.filterCompletedInstants().getInstants().map(f -> f.getTimestamp()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHistorySchemaStr() {
|
||||
return getHistorySchemaStrByGivenValidCommits(Collections.EMPTY_LIST);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHistorySchemaStrByGivenValidCommits(List<String> validCommits) {
|
||||
List<String> commitList = validCommits == null || validCommits.isEmpty() ? getValidInstants() : validCommits;
|
||||
try {
|
||||
FileSystem fs = FSUtils.getFs(baseSchemaPath.toString(), conf);
|
||||
if (fs.exists(baseSchemaPath)) {
|
||||
List<String> validaSchemaFiles = Arrays.stream(fs.listStatus(baseSchemaPath))
|
||||
.filter(f -> f.isFile() && f.getPath().getName().endsWith(SCHEMA_COMMIT_ACTION))
|
||||
.map(file -> file.getPath().getName()).filter(f -> commitList.contains(f.split("\\.")[0])).sorted().collect(Collectors.toList());
|
||||
if (!validaSchemaFiles.isEmpty()) {
|
||||
Path latestFilePath = new Path(baseSchemaPath, validaSchemaFiles.get(validaSchemaFiles.size() - 1));
|
||||
byte[] content;
|
||||
try (FSDataInputStream is = fs.open(latestFilePath)) {
|
||||
content = FileIOUtils.readAsByteArray(is);
|
||||
LOG.info(String.format("read history schema success from file : %s", latestFilePath));
|
||||
return new String(content, StandardCharsets.UTF_8);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Could not read history schema from " + latestFilePath, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException io) {
|
||||
throw new HoodieException(io);
|
||||
}
|
||||
LOG.info("failed to read history schema");
|
||||
return "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<InternalSchema> getSchemaByKey(String versionId) {
|
||||
String historySchemaStr = getHistorySchemaStr();
|
||||
TreeMap<Long, InternalSchema> treeMap;
|
||||
if (historySchemaStr.isEmpty()) {
|
||||
return Option.empty();
|
||||
} else {
|
||||
treeMap = SerDeHelper.parseSchemas(historySchemaStr);
|
||||
InternalSchema result = InternalSchemaUtils.searchSchema(Long.valueOf(versionId), treeMap);
|
||||
if (result == null) {
|
||||
return Option.empty();
|
||||
}
|
||||
return Option.of(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.action.TableChanges;
|
||||
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Utility methods to support evolve old avro schema based on a given schema.
|
||||
*/
|
||||
public class AvroSchemaEvolutionUtils {
|
||||
/**
|
||||
* Support evolution from a new avroSchema.
|
||||
* Now hoodie support implicitly add columns when hoodie write operation,
|
||||
* This ability needs to be preserved, so implicitly evolution for internalSchema should supported.
|
||||
*
|
||||
* @param evolvedSchema implicitly evolution of avro when hoodie write operation
|
||||
* @param oldSchema old internalSchema
|
||||
* @param supportPositionReorder support position reorder
|
||||
* @return evolution Schema
|
||||
*/
|
||||
public static InternalSchema evolveSchemaFromNewAvroSchema(Schema evolvedSchema, InternalSchema oldSchema, Boolean supportPositionReorder) {
|
||||
InternalSchema evolvedInternalSchema = AvroInternalSchemaConverter.convert(evolvedSchema);
|
||||
// do check, only support add column evolution
|
||||
List<String> colNamesFromEvolved = evolvedInternalSchema.getAllColsFullName();
|
||||
List<String> colNamesFromOldSchema = oldSchema.getAllColsFullName();
|
||||
List<String> diffFromOldSchema = colNamesFromOldSchema.stream().filter(f -> !colNamesFromEvolved.contains(f)).collect(Collectors.toList());
|
||||
List<Types.Field> newFields = new ArrayList<>();
|
||||
if (colNamesFromEvolved.size() == colNamesFromOldSchema.size() && diffFromOldSchema.size() == 0) {
|
||||
// no changes happen
|
||||
if (supportPositionReorder) {
|
||||
evolvedInternalSchema.getRecord().fields().forEach(f -> newFields.add(oldSchema.getRecord().field(f.name())));
|
||||
return new InternalSchema(newFields);
|
||||
}
|
||||
return oldSchema;
|
||||
}
|
||||
// try to find all added columns
|
||||
if (diffFromOldSchema.size() != 0) {
|
||||
throw new UnsupportedOperationException("Cannot evolve schema implicitly, find delete/rename operation");
|
||||
}
|
||||
|
||||
List<String> diffFromEvolutionSchema = colNamesFromEvolved.stream().filter(f -> !colNamesFromOldSchema.contains(f)).collect(Collectors.toList());
|
||||
// Remove redundancy from diffFromEvolutionSchema.
|
||||
// for example, now we add a struct col in evolvedSchema, the struct col is " user struct<name:string, age:int> "
|
||||
// when we do diff operation: user, user.name, user.age will appeared in the resultSet which is redundancy, user.name and user.age should be excluded.
|
||||
// deal with add operation
|
||||
TreeMap<Integer, String> finalAddAction = new TreeMap<>();
|
||||
for (int i = 0; i < diffFromEvolutionSchema.size(); i++) {
|
||||
String name = diffFromEvolutionSchema.get(i);
|
||||
int splitPoint = name.lastIndexOf(".");
|
||||
String parentName = splitPoint > 0 ? name.substring(0, splitPoint) : "";
|
||||
if (!parentName.isEmpty() && diffFromEvolutionSchema.contains(parentName)) {
|
||||
// find redundancy, skip it
|
||||
continue;
|
||||
}
|
||||
finalAddAction.put(evolvedInternalSchema.findIdByName(name), name);
|
||||
}
|
||||
|
||||
TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
|
||||
finalAddAction.entrySet().stream().forEach(f -> {
|
||||
String name = f.getValue();
|
||||
int splitPoint = name.lastIndexOf(".");
|
||||
String parentName = splitPoint > 0 ? name.substring(0, splitPoint) : "";
|
||||
String rawName = splitPoint > 0 ? name.substring(splitPoint + 1) : name;
|
||||
addChange.addColumns(parentName, rawName, evolvedInternalSchema.findType(name), null);
|
||||
});
|
||||
|
||||
InternalSchema res = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
|
||||
if (supportPositionReorder) {
|
||||
evolvedInternalSchema.getRecord().fields().forEach(f -> newFields.add(oldSchema.getRecord().field(f.name())));
|
||||
return new InternalSchema(newFields);
|
||||
} else {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
public static InternalSchema evolveSchemaFromNewAvroSchema(Schema evolvedSchema, InternalSchema oldSchema) {
|
||||
return evolveSchemaFromNewAvroSchema(evolvedSchema, oldSchema, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Canonical the nullability.
|
||||
* Do not allow change cols Nullability field from optional to required.
|
||||
* If above problem occurs, try to correct it.
|
||||
*
|
||||
* @param writeSchema writeSchema hoodie used to write data.
|
||||
* @param readSchema read schema
|
||||
* @return canonical Schema
|
||||
*/
|
||||
public static Schema canonicalizeColumnNullability(Schema writeSchema, Schema readSchema) {
|
||||
if (writeSchema.getFields().isEmpty() || readSchema.getFields().isEmpty()) {
|
||||
return writeSchema;
|
||||
}
|
||||
InternalSchema writeInternalSchema = AvroInternalSchemaConverter.convert(writeSchema);
|
||||
InternalSchema readInternalSchema = AvroInternalSchemaConverter.convert(readSchema);
|
||||
List<String> colNamesWriteSchema = writeInternalSchema.getAllColsFullName();
|
||||
List<String> colNamesFromReadSchema = readInternalSchema.getAllColsFullName();
|
||||
// try to deal with optional change. now when we use sparksql to update hudi table,
|
||||
// sparksql Will change the col type from optional to required, this is a bug.
|
||||
List<String> candidateUpdateCols = colNamesWriteSchema.stream().filter(f -> {
|
||||
boolean exist = colNamesFromReadSchema.contains(f);
|
||||
if (exist && (writeInternalSchema.findField(f).isOptional() != readInternalSchema.findField(f).isOptional())) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}).collect(Collectors.toList());
|
||||
if (candidateUpdateCols.isEmpty()) {
|
||||
return writeSchema;
|
||||
}
|
||||
// try to correct all changes
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(writeInternalSchema);
|
||||
candidateUpdateCols.stream().forEach(f -> updateChange.updateColumnNullability(f, true));
|
||||
Schema result = AvroInternalSchemaConverter.convert(SchemaChangeUtils.applyTableChanges2Schema(writeInternalSchema, updateChange), writeSchema.getName());
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,270 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.internal.schema.HoodieSchemaException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.Types.Field;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.SortedMap;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Util methods to help us do some operations on InternalSchema.
|
||||
* eg: column prune, filter rebuild for query engine...
|
||||
*/
|
||||
public class InternalSchemaUtils {

  // Utility class: no instances.
  private InternalSchemaUtils() {
  }

  /**
   * Create project internalSchema, based on the project names which produced by query engine.
   * support nested project (e.g. "a.b.c").
   *
   * @param schema a internal schema.
   * @param names project names produced by query engine.
   * @return a project internalSchema.
   * @throws IllegalArgumentException if any projected name does not exist in {@code schema}.
   */
  public static InternalSchema pruneInternalSchema(InternalSchema schema, List<String> names) {
    // do check: every projected name must resolve to a field id in the schema.
    List<Integer> prunedIds = names.stream().map(name -> {
      int id = schema.findIdByName(name);
      if (id == -1) {
        throw new IllegalArgumentException(String.format("cannot prune col: %s which not exisit in hudi table", name));
      }
      return id;
    }).collect(Collectors.toList());
    // find top parent field ID. eg: a.b.c, f.g.h, only collect id of a and f ignore all child field.
    // Order of first appearance in `names` is preserved, and drives the final field order below.
    List<Integer> topParentFieldIds = new ArrayList<>();
    names.stream().forEach(f -> {
      int id = schema.findIdByName(f.split("\\.")[0]);
      if (!topParentFieldIds.contains(id)) {
        topParentFieldIds.add(id);
      }
    });
    return pruneInternalSchemaByID(schema, prunedIds, topParentFieldIds);
  }

  /**
   * Create project internalSchema.
   * support nested project.
   *
   * @param schema a internal schema.
   * @param fieldIds project col field_ids.
   * @param topParentFieldIds ids of the top-level parent fields, in the order the result should expose them;
   *                          may be null/empty, in which case the pruned record's own order is kept.
   * @return a project internalSchema.
   * @throws HoodieSchemaException if a requested top-level id is absent from the pruned record.
   */
  public static InternalSchema pruneInternalSchemaByID(InternalSchema schema, List<Integer> fieldIds, List<Integer> topParentFieldIds) {
    Types.RecordType recordType = (Types.RecordType)pruneType(schema.getRecord(), fieldIds);
    // reorder top parent fields, since the recordType.fields() produced by pruneType maybe out of order.
    List<Types.Field> newFields = new ArrayList<>();
    if (topParentFieldIds != null && !topParentFieldIds.isEmpty()) {
      for (int id : topParentFieldIds) {
        Types.Field f = recordType.field(id);
        if (f != null) {
          newFields.add(f);
        } else {
          throw new HoodieSchemaException(String.format("cannot find pruned id %s in currentSchema %s", id, schema.toString()));
        }
      }
    }
    return new InternalSchema(newFields.isEmpty() ? recordType.fields() : newFields);
  }

  /**
   * Project hudi type by projected cols field_ids
   * this is auxiliary function used by pruneInternalSchema.
   * Recurses into records/arrays/maps; returns null when nothing under {@code type}
   * survives the projection (callers treat null as "drop this subtree").
   */
  private static Type pruneType(Type type, List<Integer> fieldIds) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Types.Field> fields = record.fields();
        List<Type> newTypes = new ArrayList<>();
        for (Types.Field f : fields) {
          Type newType = pruneType(f.type(), fieldIds);
          if (fieldIds.contains(f.fieldId())) {
            // the whole field is projected: keep its original type untouched.
            newTypes.add(f.type());
          } else if (newType != null) {
            // only some descendants are projected: keep the pruned subtree.
            newTypes.add(newType);
          } else {
            // nothing projected under this field: mark for removal.
            newTypes.add(null);
          }
        }
        boolean changed = false;
        List<Field> newFields = new ArrayList<>();
        for (int i = 0; i < fields.size(); i++) {
          Types.Field oldField = fields.get(i);
          Type newType = newTypes.get(i);
          if (oldField.type() == newType) {
            newFields.add(oldField);
          } else if (newType != null) {
            changed = true;
            newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
          }
          // newType == null: field dropped entirely.
        }
        if (newFields.isEmpty()) {
          return null;
        }
        if (newFields.size() == fields.size() && !changed) {
          // nothing was pruned or rewritten: reuse the original record instance.
          return record;
        } else {
          return Types.RecordType.get(newFields);
        }
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType = pruneType(array.elementType(), fieldIds);
        if (fieldIds.contains(array.elementId())) {
          return array;
        } else if (newElementType != null) {
          if (array.elementType() == newElementType) {
            return array;
          }
          return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType);
        }
        return null;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Type newValueType = pruneType(map.valueType(), fieldIds);
        if (fieldIds.contains(map.valueId())) {
          return map;
        } else if (newValueType != null) {
          if (map.valueType() == newValueType) {
            return map;
          }
          return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional());
        }
        return null;
      default:
        // primitives carry no ids of their own at this level: nothing to keep.
        return null;
    }
  }

  /**
   * A helper function to help correct the colName of pushed filters.
   *
   * @param name origin col name from pushed filters.
   * @param fileSchema the real schema of avro/parquet file.
   * @param querySchema the query schema which query engine produced.
   * @return a corrected name; empty string when the file does not contain the column at all.
   * @throws IllegalArgumentException if {@code name} is not part of {@code querySchema}.
   */
  public static String reBuildFilterName(String name, InternalSchema fileSchema, InternalSchema querySchema) {
    int nameId = querySchema.findIdByName(name);
    if (nameId == -1) {
      throw new IllegalArgumentException(String.format("cannot found filter col name:%s from querySchema: %s", name, querySchema));
    }
    if (fileSchema.findField(nameId) == null) {
      // added operation found
      // the read file does not contain current col, so current colFilter is invalid
      return "";
    } else {
      if (name.equals(fileSchema.findfullName(nameId))) {
        // no change happened on current col
        return name;
      } else {
        // find rename operation on current col
        // return the name from fileSchema
        return fileSchema.findfullName(nameId);
      }
    }
  }

  /**
   * Collect all type changed cols to build a colPosition -> (newColType, oldColType) map.
   * only collect top level col changed. eg: a is a nest field(record(b int, d long), now a.b is changed from int to long,
   * only a will be collected, a.b will excluded.
   *
   * @param schema a type changed internalSchema
   * @param oldSchema an old internalSchema.
   * @return a map keyed by the top-level field position in {@code schema.getRecord()}.
   */
  public static Map<Integer, Pair<Type, Type>> collectTypeChangedCols(InternalSchema schema, InternalSchema oldSchema) {
    Set<Integer> ids = schema.getAllIds();
    Set<Integer> otherIds = oldSchema.getAllIds();
    Map<Integer, Pair<Type, Type>> result = new HashMap<>();
    // only ids present in both schemas can represent a type *change*.
    ids.stream().filter(f -> otherIds.contains(f)).forEach(f -> {
      if (!schema.findType(f).equals(oldSchema.findType(f))) {
        String[] fieldNameParts = schema.findfullName(f).split("\\.");
        String[] otherFieldNameParts = oldSchema.findfullName(f).split("\\.");
        String parentName = fieldNameParts[0];
        String otherParentName = otherFieldNameParts[0];
        // guard: only record the change if the top-level parent is the same field in both schemas
        // (same nesting depth and same id), i.e. the column was not moved elsewhere.
        if (fieldNameParts.length == otherFieldNameParts.length && schema.findIdByName(parentName) == oldSchema.findIdByName(otherParentName)) {
          int index = schema.findIdByName(parentName);
          int position = schema.getRecord().fields().stream().map(s -> s.fieldId()).collect(Collectors.toList()).indexOf(index);
          // first change wins: nested changes under the same top-level field collapse to one entry.
          if (!result.containsKey(position)) {
            result.put(position, Pair.of(schema.findType(parentName), oldSchema.findType(otherParentName)));
          }
        }
      }
    });
    return result;
  }

  /**
   * Search target internalSchema by version number.
   *
   * @param versionId the internalSchema version to be search.
   * @param internalSchemas internalSchemas to be searched.
   * @return a internalSchema.
   */
  public static InternalSchema searchSchema(long versionId, List<InternalSchema> internalSchemas) {
    TreeMap<Long, InternalSchema> treeMap = new TreeMap<>();
    internalSchemas.forEach(s -> treeMap.put(s.schemaId(), s));
    return searchSchema(versionId, treeMap);
  }

  /**
   * Search target internalSchema by version number.
   * Exact match wins; otherwise the latest schema with a version strictly below
   * {@code versionId} is returned; otherwise an empty schema.
   *
   * @param versionId the internalSchema version to be search.
   * @param treeMap internalSchemas collections to be searched.
   * @return a internalSchema.
   */
  public static InternalSchema searchSchema(long versionId, TreeMap<Long, InternalSchema> treeMap) {
    if (treeMap.containsKey(versionId)) {
      return treeMap.get(versionId);
    } else {
      // headMap(versionId) is exclusive of versionId: its lastKey is the closest older version.
      SortedMap<Long, InternalSchema> headMap = treeMap.headMap(versionId);
      if (!headMap.isEmpty()) {
        return headMap.get(headMap.lastKey());
      }
    }
    return InternalSchema.getEmptyInternalSchema();
  }

  /**
   * Build the dotted full name of a field from its name and the stack of enclosing field names.
   * The deque is read outermost-first via its descending iterator.
   *
   * @param name the leaf field name.
   * @param fieldNames enclosing field names, innermost on top of the deque.
   * @return the dot-joined full name (e.g. "a.b.c"), or {@code name} itself when there are no parents.
   */
  public static String createFullName(String name, Deque<String> fieldNames) {
    String result = name;
    if (!fieldNames.isEmpty()) {
      List<String> parentNames = new ArrayList<>();
      fieldNames.descendingIterator().forEachRemaining(parentNames::add);
      result = parentNames.stream().collect(Collectors.joining(".")) + "." + result;
    }
    return result;
  }
}
|
||||
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.action.TableChanges;
|
||||
import org.apache.hudi.internal.schema.action.TableChangesHelper;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helper methods for schema Change.
|
||||
*/
|
||||
public class SchemaChangeUtils {
  // Utility class: no instances.
  private SchemaChangeUtils() {

  }

  /**
   * Whether to allow the column type to be updated.
   * now only support:
   * int => long/float/double/string/decimal
   * long => float/double/string/decimal
   * float => double/string/decimal
   * double => string/decimal
   * decimal => wider decimal/string
   * string => date/decimal
   * date => string
   * TODO: support more type update.
   *
   * @param src origin column type.
   * @param dsr new column type.
   * @return whether to allow the column type to be updated.
   * @throws IllegalArgumentException if either type is a nested (record/array/map) type.
   */
  public static boolean isTypeUpdateAllow(Type src, Type dsr) {
    if (src.isNestedType() || dsr.isNestedType()) {
      throw new IllegalArgumentException("only support update primitive type");
    }
    if (src.equals(dsr)) {
      // identity update is always allowed.
      return true;
    }
    switch (src.typeId()) {
      case INT:
        return dsr == Types.LongType.get() || dsr == Types.FloatType.get()
            || dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL;
      case LONG:
        return dsr == Types.FloatType.get() || dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL;
      case FLOAT:
        return dsr == Types.DoubleType.get() || dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL;
      case DOUBLE:
        return dsr == Types.StringType.get() || dsr.typeId() == Type.TypeID.DECIMAL;
      case DATE:
        return dsr == Types.StringType.get();
      case DECIMAL:
        if (dsr.typeId() == Type.TypeID.DECIMAL) {
          // decimal -> decimal only when the target can represent every source value.
          Types.DecimalType decimalSrc = (Types.DecimalType)src;
          Types.DecimalType decimalDsr = (Types.DecimalType)dsr;
          if (decimalDsr.isWiderThan(decimalSrc)) {
            return true;
          }
        } else if (dsr.typeId() == Type.TypeID.STRING) {
          return true;
        }
        break;
      case STRING:
        return dsr == Types.DateType.get() || dsr.typeId() == Type.TypeID.DECIMAL;
      default:
        return false;
    }
    // reached only from the DECIMAL case when the target is not acceptable.
    return false;
  }

  /**
   * Apply all the DDL add operations to internalSchema to produce a new internalSchema.
   *
   * @param internalSchema origin internalSchema.
   * @param adds a wrapper class for all the DDL add operations.
   * @return a new internalSchema.
   */
  public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnAddChange adds) {
    Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), adds);
    // deal with root level changes (parent id -1 denotes the schema root).
    List<Types.Field> newFields = TableChangesHelper.applyAddChange2Fields(newType.fields(),
        adds.getParentId2AddCols().get(-1), adds.getPositionChangeMap().get(-1));
    return new InternalSchema(newFields);
  }

  /**
   * Apply all the DDL add operations to Type to produce a new internalSchema.
   * do not call this method directly. expose this method only for UT.
   *
   * @param type origin hudi Type.
   * @param adds a wrapper class for all the DDL add operations.
   * @return a new internalSchema.
   */
  public static Type applyTableChange2Type(Type type, TableChanges.ColumnAddChange adds) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Type> newTypes = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          // depth-first: rewrite children before applying adds at this level.
          Type newType = applyTableChange2Type(f.type(), adds);
          // try to apply add (only nested types can receive new child columns).
          newTypes.add(newType.isNestedType() ? adds.applyAdd(f, newType) : newType);
        }
        List<Types.Field> newFields = new ArrayList<>();
        boolean hasChanged = false;
        for (int i = 0; i < newTypes.size(); i++) {
          Type newType = newTypes.get(i);
          Types.Field oldfield = record.fields().get(i);
          if (oldfield.type() == newType) {
            newFields.add(oldfield);
          } else {
            hasChanged = true;
            newFields.add(Types.Field.get(oldfield.fieldId(), oldfield.isOptional(), oldfield.name(), newType, oldfield.doc()));
          }
        }
        // reuse the original record instance when nothing changed.
        return hasChanged ? Types.RecordType.get(newFields) : record;
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType;
        Types.Field elementField = array.field(array.elementId());
        newElementType = applyTableChange2Type(array.elementType(), adds);
        // try to apply add
        newElementType = adds.applyAdd(elementField, newElementType);
        if (newElementType == array.elementType()) {
          return array;
        }
        return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Type newValueType;
        Types.Field valueField = map.field(map.valueId());
        // map keys are fixed: adding columns under a key is not a meaningful schema change.
        if (adds.getParentId2AddCols().containsKey(map.keyId())) {
          throw new IllegalArgumentException("Cannot add fields to map keys: " + map);
        }
        newValueType = applyTableChange2Type(map.valueType(), adds);
        // try to apply add
        newValueType = adds.applyAdd(valueField, newValueType);
        if (newValueType == map.valueType()) {
          return map;
        }
        return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional());
      default:
        // primitives cannot receive added columns.
        return type;
    }
  }

  /**
   * Apply all the DDL delete operations to internalSchema to produce a new internalSchema.
   *
   * @param internalSchema origin internalSchema.
   * @param deletes a wrapper class for all the DDL delete operations.
   * @return a new internalSchema.
   */
  public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnDeleteChange deletes) {
    Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), deletes);
    return new InternalSchema(newType.fields());
  }

  /**
   * Apply all the DDL delete operations to Type to produce a new internalSchema.
   * do not call this method directly. expose this method only for UT.
   *
   * @param type origin type.
   * @param deletes a wrapper class for all the DDL delete operations.
   * @return a new internalSchema.
   */
  private static Type applyTableChange2Type(Type type, TableChanges.ColumnDeleteChange deletes) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Types.Field> fields = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          // depth-first: delete inside children first, then possibly delete the child itself.
          Type newType = applyTableChange2Type(f.type(), deletes);
          // apply delete (returns null when the field itself is deleted).
          newType = deletes.applyDelete(f.fieldId(), newType);
          if (newType != null) {
            fields.add(Types.Field.get(f.fieldId(), f.isOptional(), f.name(), newType, f.doc()));
          }
        }
        if (fields.isEmpty()) {
          throw new UnsupportedOperationException("cannot support delete all columns from Struct");
        }
        return Types.RecordType.get(fields);
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType = applyTableChange2Type(array.elementType(), deletes);
        newElementType = deletes.applyDelete(array.elementId(), newElementType);
        if (newElementType == null) {
          // an array without an element type is meaningless.
          throw new IllegalArgumentException(String.format("cannot delete element from arrayType: %s", array));
        }
        return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        int keyId = map.fields().get(0).fieldId();
        if (deletes.getDeletes().contains(keyId)) {
          throw new IllegalArgumentException(String.format("cannot delete key from mapType: %s", map));
        }
        Type newValueType = applyTableChange2Type(map.valueType(), deletes);
        newValueType = deletes.applyDelete(map.valueId(), newValueType);
        if (newValueType == null) {
          throw new IllegalArgumentException(String.format("cannot delete value from mapType: %s", map));
        }
        return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional());
      default:
        // primitives have no children to delete.
        return type;
    }
  }

  /**
   * Apply all the DDL update operations to internalSchema to produce a new internalSchema.
   *
   * @param internalSchema origin internalSchema.
   * @param updates a wrapper class for all the DDL update operations.
   * @return a new internalSchema.
   */
  public static InternalSchema applyTableChanges2Schema(InternalSchema internalSchema, TableChanges.ColumnUpdateChange updates) {
    Types.RecordType newType = (Types.RecordType)applyTableChange2Type(internalSchema.getRecord(), updates);
    // deal with root level changes (position moves only; no columns are added here).
    List<Types.Field> newFields = TableChangesHelper.applyAddChange2Fields(newType.fields(),
        new ArrayList<>(), updates.getPositionChangeMap().get(-1));
    return new InternalSchema(newFields);
  }

  /**
   * Apply all the DDL update operations to type to produce a new internalSchema.
   * do not call this method directly. expose this method only for UT.
   *
   * @param type origin internalSchema.
   * @param updates a wrapper class for all the DDL update operations.
   * @return a new internalSchema.
   */
  private static Type applyTableChange2Type(Type type, TableChanges.ColumnUpdateChange updates) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Type> newTypes = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          // depth-first: update children first, then the field's own type.
          Type newType = applyTableChange2Type(f.type(), updates);
          newTypes.add(updates.applyUpdates(f, newType));
        }
        List<Types.Field> newFields = new ArrayList<>();
        for (int i = 0; i < newTypes.size(); i++) {
          Type newType = newTypes.get(i);
          Types.Field oldField = record.fields().get(i);
          Types.Field updateField = updates.getUpdates().get(oldField.fieldId());
          if (updateField != null) {
            // an explicit update may rename the field and change optionality/doc; the id is kept.
            newFields.add(Types.Field.get(oldField.fieldId(), updateField.isOptional(), updateField.name(), newType, updateField.doc()));
          } else if (!oldField.type().equals(newType)) {
            // only a nested child changed: keep the field metadata, swap the type.
            newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
          } else {
            newFields.add(oldField);
          }
        }
        return Types.RecordType.get(newFields);
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType;
        Types.Field elementField = array.fields().get(0);
        newElementType = applyTableChange2Type(array.elementType(), updates);
        newElementType = updates.applyUpdates(elementField, newElementType);
        Types.Field elementUpdate = updates.getUpdates().get(elementField.fieldId());
        boolean optional = elementUpdate == null ? array.isElementOptional() : elementUpdate.isOptional();
        if (optional == elementField.isOptional() && array.elementType() == newElementType) {
          return array;
        }
        return Types.ArrayType.get(array.elementId(), optional, newElementType);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Types.Field valueFiled = map.fields().get(1);
        Type newValueType;
        newValueType = applyTableChange2Type(map.valueType(), updates);
        newValueType = updates.applyUpdates(valueFiled, newValueType);
        Types.Field valueUpdate = updates.getUpdates().get(valueFiled.fieldId());
        boolean valueOptional = valueUpdate == null ? map.isValueOptional() : valueUpdate.isOptional();
        if (valueOptional == map.isValueOptional() && map.valueType() == newValueType) {
          return map;
        }
        return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, valueOptional);
      default:
        // primitives are updated (if at all) by their parent record's applyUpdates call.
        return type;
    }
  }
}
|
||||
|
||||
@@ -0,0 +1,351 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonFactory;
|
||||
import com.fasterxml.jackson.core.JsonGenerator;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class SerDeHelper {
|
||||
// Utility class: prevent instantiation.
private SerDeHelper() {

}
|
||||
|
||||
// Top-level JSON keys used when persisting schemas.
public static final String LATEST_SCHEMA = "latest_schema";
public static final String SCHEMAS = "schemas";
// Per-schema metadata keys.
private static final String MAX_COLUMN_ID = "max_column_id";
private static final String VERSION_ID = "version_id";
// Type-discriminator key and the nested-type tags it may carry.
private static final String TYPE = "type";
private static final String RECORD = "record";
private static final String ARRAY = "array";
private static final String MAP = "map";
// Structural keys for record/array/map payloads.
private static final String FIELDS = "fields";
private static final String ELEMENT = "element";
private static final String KEY = "key";
private static final String VALUE = "value";
// Field-level keys.
private static final String DOC = "doc";
private static final String NAME = "name";
private static final String ID = "id";
private static final String ELEMENT_ID = "element_id";
private static final String KEY_ID = "key_id";
private static final String VALUE_ID = "value_id";
private static final String OPTIONAL = "optional";
private static final String ELEMENT_OPTIONAL = "element_optional";
private static final String VALUE_OPTIONAL = "value_optional";

// Parameterized primitive type spellings, e.g. "fixed[16]" and "decimal(10, 2)".
// NOTE(review): DECIMAL requires whitespace after the comma ("\\s+") — matches the
// output of Types.DecimalType.toString(); confirm if hand-written JSON must round-trip.
private static final Pattern FIXED = Pattern.compile("fixed\\[(\\d+)\\]");
private static final Pattern DECIMAL = Pattern.compile("decimal\\((\\d+),\\s+(\\d+)\\)");
|
||||
|
||||
/**
|
||||
* Convert history internalSchemas to json.
|
||||
* this is used when save history schemas into hudi.
|
||||
*
|
||||
* @param internalSchemas history internal schemas
|
||||
* @return a string
|
||||
*/
|
||||
public static String toJson(List<InternalSchema> internalSchemas) {
|
||||
try {
|
||||
StringWriter writer = new StringWriter();
|
||||
JsonGenerator generator = (new JsonFactory()).createGenerator(writer);
|
||||
generator.writeStartObject();
|
||||
generator.writeArrayFieldStart(SCHEMAS);
|
||||
for (InternalSchema schema : internalSchemas) {
|
||||
toJson(schema, generator);
|
||||
}
|
||||
generator.writeEndArray();
|
||||
generator.writeEndObject();
|
||||
generator.flush();
|
||||
return writer.toString();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert internalSchemas to json.
|
||||
*
|
||||
* @param internalSchema a internal schema
|
||||
* @return a string
|
||||
*/
|
||||
public static String toJson(InternalSchema internalSchema) {
|
||||
if (internalSchema == null || internalSchema.isEmptySchema()) {
|
||||
return "";
|
||||
}
|
||||
try {
|
||||
StringWriter writer = new StringWriter();
|
||||
JsonGenerator generator = (new JsonFactory()).createGenerator(writer);
|
||||
toJson(internalSchema, generator);
|
||||
generator.flush();
|
||||
return writer.toString();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize one InternalSchema onto the given generator: its record type plus
// the schema-level metadata (max column id and version id).
private static void toJson(InternalSchema internalSchema, JsonGenerator generator) throws IOException {
  toJson(internalSchema.getRecord(), internalSchema.getMaxColumnId(), internalSchema.schemaId(), generator);
}
|
||||
|
||||
/**
 * Serialize a record type as a JSON object.
 * Field order is part of the wire format:
 * {@code {"max_column_id": .., "version_id": .., "type": "record", "fields": [..]}}.
 * {@code maxColumnId}/{@code versionId} are written only at the schema root
 * (nested records pass null for both — see {@code toJson(Type, JsonGenerator)}).
 */
private static void toJson(Types.RecordType record, Integer maxColumnId, Long versionId, JsonGenerator generator) throws IOException {
  generator.writeStartObject();
  if (maxColumnId != null) {
    generator.writeNumberField(MAX_COLUMN_ID, maxColumnId);
  }
  if (versionId != null) {
    generator.writeNumberField(VERSION_ID, versionId);
  }
  generator.writeStringField(TYPE, RECORD);
  generator.writeArrayFieldStart(FIELDS);
  for (Types.Field field : record.fields()) {
    generator.writeStartObject();
    generator.writeNumberField(ID, field.fieldId());
    generator.writeStringField(NAME, field.name());
    generator.writeBooleanField(OPTIONAL, field.isOptional());
    generator.writeFieldName(TYPE);
    toJson(field.type(), generator);
    // doc is optional on the wire: omitted entirely when absent.
    if (field.doc() != null) {
      generator.writeStringField(DOC, field.doc());
    }
    generator.writeEndObject();
  }
  generator.writeEndArray();
  generator.writeEndObject();
}
|
||||
|
||||
/**
 * Serialize an arbitrary hudi Type onto the generator.
 * Nested records/arrays/maps become JSON objects; primitives are written as
 * their {@code toString()} spelling (e.g. "int", "decimal(10, 2)") — the exact
 * inverse of {@code parserTypeFromJson}.
 */
private static void toJson(Type type, JsonGenerator generator) throws IOException {
  switch (type.typeId()) {
    case RECORD:
      // nested records carry no max_column_id / version_id.
      toJson((Types.RecordType) type, null, null, generator);
      break;
    case ARRAY:
      Types.ArrayType array = (Types.ArrayType) type;
      generator.writeStartObject();
      generator.writeStringField(TYPE, ARRAY);
      generator.writeNumberField(ELEMENT_ID, array.elementId());
      generator.writeFieldName(ELEMENT);
      toJson(array.elementType(), generator);
      generator.writeBooleanField(ELEMENT_OPTIONAL, array.isElementOptional());
      generator.writeEndObject();
      break;
    case MAP:
      Types.MapType map = (Types.MapType) type;
      generator.writeStartObject();
      generator.writeStringField(TYPE, MAP);
      generator.writeNumberField(KEY_ID, map.keyId());
      generator.writeFieldName(KEY);
      toJson(map.keyType(), generator);
      generator.writeNumberField(VALUE_ID, map.valueId());
      generator.writeFieldName(VALUE);
      toJson(map.valueType(), generator);
      generator.writeBooleanField(VALUE_OPTIONAL, map.isValueOptional());
      generator.writeEndObject();
      break;
    default:
      if (!type.isNestedType()) {
        // primitives serialize as a bare string token.
        generator.writeString(type.toString());
      } else {
        // defensive: a nested TypeID without a case above is a programming error.
        throw new IllegalArgumentIOException(String.format("cannot write unknown types: %s", type));
      }
  }
}
|
||||
|
||||
/**
 * Parse a hudi Type back out of its JSON representation (inverse of
 * {@code toJson(Type, JsonGenerator)}).
 * Textual nodes are primitives ("int", "fixed[16]", "decimal(10, 2)", ...);
 * object nodes are record/array/map discriminated by the "type" key.
 *
 * @throws IllegalArgumentException when the node matches no known type.
 */
private static Type parserTypeFromJson(JsonNode jsonNode) {
  if (jsonNode.isTextual()) {
    // primitive spellings are matched case-insensitively.
    String type = jsonNode.asText().toLowerCase(Locale.ROOT);
    // deal with fixed and decimal (the two parameterized primitives) first.
    Matcher fixed = FIXED.matcher(type);
    if (fixed.matches()) {
      return Types.FixedType.getFixed(Integer.parseInt(fixed.group(1)));
    }
    Matcher decimal = DECIMAL.matcher(type);
    if (decimal.matches()) {
      return Types.DecimalType.get(
          Integer.parseInt(decimal.group(1)),
          Integer.parseInt(decimal.group(2)));
    }
    // deal with other type
    switch (Type.fromValue(type)) {
      case BOOLEAN:
        return Types.BooleanType.get();
      case INT:
        return Types.IntType.get();
      case LONG:
        return Types.LongType.get();
      case FLOAT:
        return Types.FloatType.get();
      case DOUBLE:
        return Types.DoubleType.get();
      case DATE:
        return Types.DateType.get();
      case TIME:
        return Types.TimeType.get();
      case TIMESTAMP:
        return Types.TimestampType.get();
      case STRING:
        return Types.StringType.get();
      case UUID:
        return Types.UUIDType.get();
      case BINARY:
        return Types.BinaryType.get();
      default:
        throw new IllegalArgumentException("cannot parser types from jsonNode");
    }
  } else if (jsonNode.isObject()) {
    String typeStr = jsonNode.get(TYPE).asText();
    if (RECORD.equals(typeStr)) {
      JsonNode fieldNodes = jsonNode.get(FIELDS);
      Iterator<JsonNode> iter = fieldNodes.elements();
      List<Types.Field> fields = new ArrayList<>();
      while (iter.hasNext()) {
        JsonNode field = iter.next();
        // extract
        int id = field.get(ID).asInt();
        String name = field.get(NAME).asText();
        Type type = parserTypeFromJson(field.get(TYPE));
        // doc is optional on the wire (see the writer), so guard with has().
        String doc = field.has(DOC) ? field.get(DOC).asText() : null;
        boolean optional = field.get(OPTIONAL).asBoolean();
        // build fields
        fields.add(Types.Field.get(id, optional, name, type, doc));
      }
      return Types.RecordType.get(fields);
    } else if (ARRAY.equals(typeStr)) {
      int elementId = jsonNode.get(ELEMENT_ID).asInt();
      Type elementType = parserTypeFromJson(jsonNode.get(ELEMENT));
      boolean optional = jsonNode.get(ELEMENT_OPTIONAL).asBoolean();
      return Types.ArrayType.get(elementId, optional, elementType);
    } else if (MAP.equals(typeStr)) {
      int keyId = jsonNode.get(KEY_ID).asInt();
      Type keyType = parserTypeFromJson(jsonNode.get(KEY));
      int valueId = jsonNode.get(VALUE_ID).asInt();
      Type valueType = parserTypeFromJson(jsonNode.get(VALUE));
      boolean optional = jsonNode.get(VALUE_OPTIONAL).asBoolean();
      return Types.MapType.get(keyId, valueId, keyType, valueType, optional);
    }
  }
  // neither a known primitive spelling nor a recognized nested-type object.
  throw new IllegalArgumentException(String.format("cannot parse type from jsonNode: %s", jsonNode));
}
|
||||
|
||||
/**
|
||||
* Convert jsonNode to internalSchema.
|
||||
*
|
||||
* @param jsonNode a jsonNode.
|
||||
* @return a internalSchema.
|
||||
*/
|
||||
public static InternalSchema fromJson(JsonNode jsonNode) {
|
||||
Integer maxColumnId = !jsonNode.has(MAX_COLUMN_ID) ? null : jsonNode.get(MAX_COLUMN_ID).asInt();
|
||||
Long versionId = !jsonNode.has(VERSION_ID) ? null : jsonNode.get(VERSION_ID).asLong();
|
||||
Types.RecordType type = (Types.RecordType)parserTypeFromJson(jsonNode);
|
||||
if (versionId == null) {
|
||||
return new InternalSchema(type.fields());
|
||||
} else {
|
||||
if (maxColumnId != null) {
|
||||
return new InternalSchema(versionId, maxColumnId, type.fields());
|
||||
} else {
|
||||
return new InternalSchema(versionId, type.fields());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert string to internalSchema.
|
||||
*
|
||||
* @param json a json string.
|
||||
* @return a internalSchema.
|
||||
*/
|
||||
public static Option<InternalSchema> fromJson(String json) {
|
||||
if (json == null || json.isEmpty()) {
|
||||
return Option.empty();
|
||||
}
|
||||
try {
|
||||
return Option.of(fromJson((new ObjectMapper(new JsonFactory())).readValue(json, JsonNode.class)));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert json string to history internalSchemas.
|
||||
* TreeMap is used to hold history internalSchemas.
|
||||
*
|
||||
* @param json a json string
|
||||
* @return a TreeMap
|
||||
*/
|
||||
public static TreeMap<Long, InternalSchema> parseSchemas(String json) {
|
||||
TreeMap<Long, InternalSchema> result = new TreeMap<>();
|
||||
try {
|
||||
JsonNode jsonNode = (new ObjectMapper(new JsonFactory())).readValue(json, JsonNode.class);
|
||||
if (!jsonNode.has(SCHEMAS)) {
|
||||
throw new IllegalArgumentException(String.format("cannot parser schemas from current json string, missing key name: %s", SCHEMAS));
|
||||
}
|
||||
JsonNode schemas = jsonNode.get(SCHEMAS);
|
||||
Iterator<JsonNode> iter = schemas.elements();
|
||||
while (iter.hasNext()) {
|
||||
JsonNode schema = iter.next();
|
||||
InternalSchema current = fromJson(schema);
|
||||
result.put(current.schemaId(), current);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the new schema to the historical schemas.
|
||||
* use string operations to reduce overhead.
|
||||
*
|
||||
* @param newSchema a new internalSchema
|
||||
* @param oldSchemas historical schemas string.
|
||||
* @return a string.
|
||||
*/
|
||||
public static String inheritSchemas(InternalSchema newSchema, String oldSchemas) {
|
||||
if (newSchema == null) {
|
||||
return "";
|
||||
}
|
||||
if (oldSchemas == null || oldSchemas.isEmpty()) {
|
||||
return toJson(Arrays.asList(newSchema));
|
||||
}
|
||||
String checkedString = "{\"schemas\":[";
|
||||
if (!oldSchemas.startsWith("{\"schemas\":")) {
|
||||
return "";
|
||||
}
|
||||
String oldSchemasSuffix = oldSchemas.substring(checkedString.length());
|
||||
return checkedString + toJson(newSchema) + "," + oldSchemasSuffix;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.visitor;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Base class of schema visitor.
|
||||
*/
|
||||
public abstract class InternalSchemaVisitor<T> {
|
||||
|
||||
public void beforeField(Types.Field field) {
|
||||
}
|
||||
|
||||
public void afterField(Types.Field field) {
|
||||
}
|
||||
|
||||
public void beforeArrayElement(Types.Field elementField) {
|
||||
beforeField(elementField);
|
||||
}
|
||||
|
||||
public void afterArrayElement(Types.Field elementField) {
|
||||
afterField(elementField);
|
||||
}
|
||||
|
||||
public void beforeMapKey(Types.Field keyField) {
|
||||
beforeField(keyField);
|
||||
}
|
||||
|
||||
public void afterMapKey(Types.Field keyField) {
|
||||
afterField(keyField);
|
||||
}
|
||||
|
||||
public void beforeMapValue(Types.Field valueField) {
|
||||
beforeField(valueField);
|
||||
}
|
||||
|
||||
public void afterMapValue(Types.Field valueField) {
|
||||
afterField(valueField);
|
||||
}
|
||||
|
||||
public T schema(InternalSchema schema, T recordResult) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public T record(Types.RecordType record, List<T> fieldResults) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public T field(Types.Field field, T fieldResult) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public T array(Types.ArrayType array, T elementResult) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public T map(Types.MapType map, T keyResult, T valueResult) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public T primitive(Type.PrimitiveType primitive) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.visitor;
|
||||
|
||||
import static org.apache.hudi.internal.schema.utils.InternalSchemaUtils.createFullName;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Schema visitor to produce name -> id map for internalSchema.
|
||||
*/
|
||||
public class NameToIDVisitor extends InternalSchemaVisitor<Map<String, Integer>> {
|
||||
private final Deque fieldNames = new LinkedList<>();
|
||||
private final Map<String, Integer> nameToId = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public void beforeField(Types.Field field) {
|
||||
fieldNames.push(field.name());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterField(Types.Field field) {
|
||||
fieldNames.pop();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void beforeArrayElement(Types.Field elementField) {
|
||||
fieldNames.push(elementField.name());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterArrayElement(Types.Field elementField) {
|
||||
fieldNames.pop();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void beforeMapKey(Types.Field keyField) {
|
||||
fieldNames.push(keyField.name());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterMapKey(Types.Field keyField) {
|
||||
fieldNames.pop();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void beforeMapValue(Types.Field valueField) {
|
||||
fieldNames.push(valueField.name());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterMapValue(Types.Field valueField) {
|
||||
fieldNames.pop();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> schema(InternalSchema schema, Map<String, Integer> recordResult) {
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> record(Types.RecordType record, List<Map<String, Integer>> fieldResults) {
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> field(Types.Field field, Map<String, Integer> fieldResult) {
|
||||
nameToId.put(createFullName(field.name(), fieldNames), field.fieldId());
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> array(Types.ArrayType array, Map<String, Integer> elementResult) {
|
||||
nameToId.put(createFullName("element", fieldNames), array.elementId());
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> map(Types.MapType map, Map<String, Integer> keyResult, Map<String, Integer> valueResult) {
|
||||
nameToId.put(createFullName("key", fieldNames), map.keyId());
|
||||
nameToId.put(createFullName("value", fieldNames), map.valueId());
|
||||
return nameToId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> primitive(Type.PrimitiveType primitive) {
|
||||
return nameToId;
|
||||
}
|
||||
}
|
||||
@@ -30,6 +30,7 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.SpillableMapUtils;
|
||||
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
@@ -65,7 +66,7 @@ public class HoodieMetadataMergedLogRecordReader extends HoodieMergedLogRecordSc
|
||||
Option<InstantRange> instantRange, boolean enableFullScan) {
|
||||
super(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, false, false, bufferSize,
|
||||
spillableMapBasePath, instantRange, false, diskMapType, isBitCaskDiskMapCompressionEnabled, false,
|
||||
enableFullScan, Option.of(partitionName));
|
||||
enableFullScan, Option.of(partitionName), InternalSchema.getEmptyInternalSchema());
|
||||
this.mergeKeyFilter = mergeKeyFilter;
|
||||
if (enableFullScan) {
|
||||
performScan();
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema;
|
||||
|
||||
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
|
||||
import org.apache.hudi.internal.schema.utils.SerDeHelper;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
|
||||
public class TestSerDeHelper {
|
||||
|
||||
@Test
|
||||
public void testComplexSchema2Json() {
|
||||
InternalSchema internalSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(7, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
|
||||
Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))),
|
||||
Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
|
||||
Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
|
||||
);
|
||||
// test schema2json
|
||||
String result = SerDeHelper.toJson(internalSchema);
|
||||
InternalSchema convertedSchema = SerDeHelper.fromJson(result).get();
|
||||
Assertions.assertEquals(internalSchema, convertedSchema);
|
||||
// test schemas2json
|
||||
String results = SerDeHelper.toJson(Arrays.asList(internalSchema));
|
||||
TreeMap<Long, InternalSchema> convertedSchemas = SerDeHelper.parseSchemas(results);
|
||||
Assertions.assertEquals(1, convertedSchemas.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrimitive2Json() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "bool", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "int", Types.IntType.get()),
|
||||
Types.Field.get(2, "long", Types.LongType.get()),
|
||||
Types.Field.get(3, "float", Types.FloatType.get()),
|
||||
Types.Field.get(4, "double", Types.DoubleType.get()),
|
||||
Types.Field.get(5, "date", Types.DateType.get()),
|
||||
Types.Field.get(6, "time", Types.TimeType.get()),
|
||||
Types.Field.get(7, "timestamp", Types.TimestampType.get()),
|
||||
Types.Field.get(8, "string", Types.StringType.get()),
|
||||
Types.Field.get(9, "uuid", Types.UUIDType.get()),
|
||||
Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)),
|
||||
Types.Field.get(11, "binary", Types.BinaryType.get()),
|
||||
Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2))
|
||||
}));
|
||||
InternalSchema internalSchema = new InternalSchema(record.fields());
|
||||
String result = SerDeHelper.toJson(internalSchema);
|
||||
InternalSchema convertedSchema = SerDeHelper.fromJson(result).get();
|
||||
Assertions.assertEquals(internalSchema, convertedSchema);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSearchSchema() {
|
||||
List schemas = new ArrayList<>();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
schemas.add(new InternalSchema(i * 10,
|
||||
Arrays.asList(Types.Field.get(1, true, "schema" + i * 10, Types.LongType.get()))));
|
||||
}
|
||||
|
||||
Assertions.assertEquals(InternalSchemaUtils.searchSchema(0, schemas).getRecord().fields().get(0),
|
||||
Types.Field.get(1, true, "schema" + 0, Types.LongType.get()));
|
||||
|
||||
Assertions.assertEquals(InternalSchemaUtils.searchSchema(9, schemas).getRecord().fields().get(0),
|
||||
Types.Field.get(1, true, "schema" + 0, Types.LongType.get()));
|
||||
|
||||
Assertions.assertEquals(InternalSchemaUtils.searchSchema(99, schemas).getRecord().fields().get(0),
|
||||
Types.Field.get(1, true, "schema" + 90, Types.LongType.get()));
|
||||
|
||||
Assertions.assertEquals(InternalSchemaUtils.searchSchema(9999, schemas).getRecord().fields().get(0),
|
||||
Types.Field.get(1, true, "schema" + 990, Types.LongType.get()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInheritSchemas() {
|
||||
List schemas = new ArrayList<>();
|
||||
for (int i = 0; i < 2; i++) {
|
||||
schemas.add(new InternalSchema(i,
|
||||
Arrays.asList(Types.Field.get(1, true, "schema" + i, Types.LongType.get()))));
|
||||
}
|
||||
String oldSchemas = SerDeHelper.toJson(schemas);
|
||||
InternalSchema newSchema = new InternalSchema(3,
|
||||
Arrays.asList(Types.Field.get(1, true, "schema" + 3, Types.LongType.get())));
|
||||
|
||||
String finalResult = SerDeHelper.inheritSchemas(newSchema, oldSchemas);
|
||||
// convert back
|
||||
Assertions.assertEquals(SerDeHelper.parseSchemas(finalResult).size(), 3);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestMergeSchema {
|
||||
|
||||
@Test
|
||||
public void testPrimitiveMerge() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
|
||||
InternalSchema oldSchema = new InternalSchema(record.fields());
|
||||
// add c1 after 'col1', and c2 before 'col3'
|
||||
TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
|
||||
addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1");
|
||||
addChange.addPositionChange("c1", "col1", "after");
|
||||
addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3");
|
||||
addChange.addPositionChange("c2", "col3", "before");
|
||||
InternalSchema newAddSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
|
||||
TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(newAddSchema);
|
||||
deleteChange.deleteColumn("col1");
|
||||
deleteChange.deleteColumn("col3");
|
||||
InternalSchema newDeleteSchema = SchemaChangeUtils.applyTableChanges2Schema(newAddSchema, deleteChange);
|
||||
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(newDeleteSchema);
|
||||
updateChange.updateColumnType("col2", Types.LongType.get())
|
||||
.updateColumnComment("col2", "alter col2 comments")
|
||||
.renameColumn("col2", "colx").addPositionChange("col2",
|
||||
"col4", "after");
|
||||
InternalSchema updateSchema = SchemaChangeUtils.applyTableChanges2Schema(newDeleteSchema, updateChange);
|
||||
|
||||
// add col1 again
|
||||
TableChanges.ColumnAddChange addChange1 = TableChanges.ColumnAddChange.get(updateSchema);
|
||||
addChange1.addColumns("col1", Types.BooleanType.get(), "add new col1");
|
||||
InternalSchema finalSchema = SchemaChangeUtils.applyTableChanges2Schema(updateSchema, addChange1);
|
||||
// merge schema by using columnType from query schema
|
||||
InternalSchema mergeSchema = new InternalSchemaMerger(oldSchema, finalSchema, true, false).mergeSchema();
|
||||
|
||||
InternalSchema checkedSchema = new InternalSchema(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
|
||||
Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
|
||||
Types.Field.get(3, true, "col4", Types.FloatType.get()),
|
||||
Types.Field.get(1, true, "col2", Types.LongType.get(), "alter col2 comments"),
|
||||
Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), "add new col1")
|
||||
}));
|
||||
Assertions.assertEquals(mergeSchema, checkedSchema);
|
||||
|
||||
// merge schema by using columnType from file schema
|
||||
InternalSchema mergeSchema1 = new InternalSchemaMerger(oldSchema, finalSchema, true, true).mergeSchema();
|
||||
InternalSchema checkedSchema1 = new InternalSchema(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
|
||||
Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
|
||||
Types.Field.get(3, true, "col4", Types.FloatType.get()),
|
||||
Types.Field.get(1, true, "col2", Types.IntType.get(), "alter col2 comments"),
|
||||
Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), "add new col1")
|
||||
}));
|
||||
Assertions.assertEquals(mergeSchema1, checkedSchema1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.action;
|
||||
|
||||
import org.apache.hudi.internal.schema.HoodieSchemaException;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
|
||||
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestTableChanges {
|
||||
|
||||
@Test
|
||||
public void testPrimitiveAdd() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
|
||||
Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
|
||||
InternalSchema oldSchema = new InternalSchema(record.fields());
|
||||
// add c1 after 'col1', and c2 before 'col3'
|
||||
TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
|
||||
addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1");
|
||||
// check repeated add.
|
||||
Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1"));
|
||||
addChange.addPositionChange("c1", "col1", "after");
|
||||
addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3");
|
||||
addChange.addPositionChange("c2", "col3", "before");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkRecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNestAdd() {
|
||||
InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(7, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
|
||||
Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))),
|
||||
Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
|
||||
Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
|
||||
);
|
||||
|
||||
TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
|
||||
// add c1 first
|
||||
addChange.addColumns("c1", Types.StringType.get(), "add c1 first");
|
||||
addChange.addPositionChange("c1", "id", "before");
|
||||
//add preferences.cx before preferences.feature2
|
||||
addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2");
|
||||
// check repeated add.
|
||||
Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"));
|
||||
addChange.addPositionChange("preferences.cx", "preferences.feature2", "before");
|
||||
// add locations.value.lax before locations.value.long
|
||||
addChange.addColumns("locations.value", "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long");
|
||||
addChange.addPositionChange("locations.value.lax", "locations.value.long", "before");
|
||||
//
|
||||
// add points.element.z after points.element.y
|
||||
addChange.addColumns("points.element", "z", Types.BooleanType.get(), "add points.element.z after points.element.y");
|
||||
addChange.addPositionChange("points.element.z", "points.element.y", "after");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
|
||||
InternalSchema checkedSchema = new InternalSchema(
|
||||
Types.Field.get(19, true, "c1", Types.StringType.get(), "add c1 first"),
|
||||
Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()),
|
||||
Types.Field.get(20, true, "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"),
|
||||
Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()),
|
||||
Types.Field.get(21, true, "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"),
|
||||
Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
|
||||
Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()),
|
||||
Types.Field.get(15, false, "y", Types.LongType.get()),
|
||||
Types.Field.get(22, true, "z", Types.BooleanType.get(), "add points.element.z after points.element.y")))),
|
||||
Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
|
||||
Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
|
||||
);
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkedSchema.getRecord());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrimitiveDelete() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
InternalSchema oldSchema = new InternalSchema(record.fields());
|
||||
TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema);
|
||||
deleteChange.deleteColumn("col1");
|
||||
// check repeated delete.
|
||||
// deletechange can handle deleting the same column multiple times, only keep one operation.
|
||||
deleteChange.deleteColumn("col1");
|
||||
deleteChange.deleteColumn("col3");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange);
|
||||
Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkRecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNestDelete() {
|
||||
InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()), Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
|
||||
Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()), Types.Field.get(13, false, "y", Types.LongType.get()))))
|
||||
);
|
||||
TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema);
|
||||
deleteChange.deleteColumn("data");
|
||||
deleteChange.deleteColumn("preferences.feature2");
|
||||
deleteChange.deleteColumn("preferences.feature2");
|
||||
deleteChange.deleteColumn("locations.value.lat");
|
||||
deleteChange.deleteColumn("points.element.y");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange);
|
||||
InternalSchema checkedSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "feature1",
|
||||
Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
|
||||
Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()))))
|
||||
);
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkedSchema.getRecord());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrimitiveUpdate() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "col2", Types.IntType.get()),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get())}));
|
||||
InternalSchema oldSchema = new InternalSchema(record.fields());
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema);
|
||||
updateChange.updateColumnType("col2", Types.LongType.get())
|
||||
.updateColumnComment("col2", "alter col2 comments")
|
||||
.renameColumn("col2", "colx").addPositionChange("col2", "col4", "after");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange);
|
||||
Types.RecordType checkedRecord = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "col1", Types.BooleanType.get()),
|
||||
Types.Field.get(2, "col3", Types.LongType.get()),
|
||||
Types.Field.get(3, "col4", Types.FloatType.get()),
|
||||
Types.Field.get(1, true, "colx", Types.LongType.get(), "alter col2 comments")}));
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkedRecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNestUpdate() {
|
||||
InternalSchema oldSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()), Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
|
||||
Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()), Types.Field.get(13, false, "y", Types.LongType.get()))))
|
||||
);
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema);
|
||||
updateChange
|
||||
.updateColumnNullability("id", true)
|
||||
.renameColumn("id", "idx")
|
||||
.addPositionChange("data", "points", "after");
|
||||
updateChange
|
||||
.updateColumnComment("preferences.feature1", "add feature1 comment")
|
||||
.renameColumn("preferences.feature1", "f1")
|
||||
.addPositionChange("preferences.feature1", "preferences.feature1", "first");
|
||||
updateChange.updateColumnComment("locations.value.lat", "add lat comment")
|
||||
.renameColumn("locations.value.lat", "lax")
|
||||
.addPositionChange("locations.value.lat", "locations.value.lat", "first");
|
||||
updateChange.renameColumn("points.element.x", "z")
|
||||
.addPositionChange("points.element.x", "points.element.y", "after");
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange);
|
||||
InternalSchema checkSchema = new InternalSchema(Types.Field.get(0, true, "idx", Types.IntType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "f1",
|
||||
Types.BooleanType.get(), "add feature1 comment"), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(9, false, "lax", Types.FloatType.get(), "add lat comment"), Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
|
||||
Types.RecordType.get(Types.Field.get(13, false, "y", Types.LongType.get()), Types.Field.get(12, false, "z", Types.LongType.get())))),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get())
|
||||
);
|
||||
Assertions.assertEquals(newSchema.getRecord(), checkSchema.getRecord());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.io;
|
||||
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.utils.SerDeHelper;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
/**
 * Tests {@link FileBasedInternalSchemaStorageManager}.
 */
public class TestFileBasedInternalSchemaStorageManager extends HoodieCommonTestHarness {
  // Lazily created active timeline used by simulateCommit to complete instants.
  private HoodieActiveTimeline timeline;

  @BeforeEach
  public void setUp() throws Exception {
    initMetaClient();
  }

  @Test
  public void testPersistAndReadHistorySchemaStr() throws IOException {
    timeline = new HoodieActiveTimeline(metaClient);
    FileBasedInternalSchemaStorageManager fm = new FileBasedInternalSchemaStorageManager(metaClient);
    InternalSchema currentSchema = getSimpleSchema();
    currentSchema.setSchemaId(0L);
    // Save the first history schema under instant "0000".
    fm.persistHistorySchemaStr("0000", SerDeHelper.inheritSchemas(currentSchema, ""));
    // Simulate commit.
    simulateCommit("0000");
    metaClient.reloadActiveTimeline();
    // Read the schema back by its schema id.
    InternalSchema readSchema = fm.getSchemaByKey("0").get();
    assertEquals(currentSchema, readSchema);
    // Save a second history schema, inheriting the previously persisted history.
    InternalSchema secondSchema = getSimpleSchema();
    secondSchema.setSchemaId(1L);
    fm.persistHistorySchemaStr("0001", SerDeHelper.inheritSchemas(secondSchema, fm.getHistorySchemaStr()));
    // Simulate commit.
    simulateCommit("0001");
    metaClient.reloadActiveTimeline();
    // Read the second schema back.
    assertEquals(secondSchema, fm.getSchemaByKey("1").get());

    // Test write failure and residual-file cleanup.
    InternalSchema thirdSchema = getSimpleSchema();
    thirdSchema.setSchemaId(2L);
    fm.persistHistorySchemaStr("0002", SerDeHelper.inheritSchemas(thirdSchema, fm.getHistorySchemaStr()));
    // Do not simulate commit "0002", so the file saved above becomes a residual file.
    // Fourth persist attempt.
    InternalSchema lastSchema = getSimpleSchema();
    lastSchema.setSchemaId(3L);
    fm.persistHistorySchemaStr("0004", SerDeHelper.inheritSchemas(lastSchema, fm.getHistorySchemaStr()));
    simulateCommit("0004");
    metaClient.reloadActiveTimeline();
    // The residual file created by the third (uncommitted) persist should now be removed.
    File f = new File(metaClient.getSchemaFolderName() + File.separator + "0002.schemacommit");
    assertTrue(!f.exists());
    assertEquals(lastSchema, fm.getSchemaByKey("3").get());
  }

  /**
   * Creates a commit instant for the given time and drives it through
   * requested -> inflight -> completed on the active timeline.
   */
  private void simulateCommit(String commitTime) {
    if (timeline == null) {
      timeline = new HoodieActiveTimeline(metaClient);
    }
    HoodieInstant instant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, commitTime);
    timeline.createNewInstant(instant);
    timeline.transitionRequestedToInflight(instant, Option.empty());
    timeline.saveAsComplete(new HoodieInstant(true, instant.getAction(), instant.getTimestamp()),
        Option.empty());
  }

  /**
   * Builds a minimal two-column schema (bool, int) shared by the cases above.
   */
  private InternalSchema getSimpleSchema() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
        Types.Field.get(0, "bool", Types.BooleanType.get()),
        Types.Field.get(1, "int", Types.IntType.get()),
    }));
    return new InternalSchema(record.fields());
  }
}
|
||||
|
||||
@@ -0,0 +1,422 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import org.apache.avro.JsonProperties;
|
||||
import org.apache.avro.LogicalTypes;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.SchemaBuilder;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
|
||||
import org.apache.hudi.internal.schema.Type;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.apache.hudi.internal.schema.action.TableChanges;
|
||||
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class TestAvroSchemaEvolutionUtils {
|
||||
|
||||
@Test
|
||||
public void testPrimitiveTypes() {
|
||||
Schema[] avroPrimitives = new Schema[] {
|
||||
Schema.create(Schema.Type.BOOLEAN),
|
||||
Schema.create(Schema.Type.INT),
|
||||
Schema.create(Schema.Type.LONG),
|
||||
Schema.create(Schema.Type.FLOAT),
|
||||
Schema.create(Schema.Type.DOUBLE),
|
||||
LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)),
|
||||
LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)),
|
||||
LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)),
|
||||
Schema.create(Schema.Type.STRING),
|
||||
LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)),
|
||||
Schema.createFixed("fixed_12", null, null, 12),
|
||||
Schema.create(Schema.Type.BYTES),
|
||||
LogicalTypes.decimal(9, 4).addToSchema(Schema.createFixed("decimal_9_4", null, null, 4))};
|
||||
|
||||
Type[] primitiveTypes = new Type[] {
|
||||
Types.BooleanType.get(),
|
||||
Types.IntType.get(),
|
||||
Types.LongType.get(),
|
||||
Types.FloatType.get(),
|
||||
Types.DoubleType.get(),
|
||||
Types.DateType.get(),
|
||||
Types.TimeType.get(),
|
||||
Types.TimestampType.get(),
|
||||
Types.StringType.get(),
|
||||
Types.UUIDType.get(),
|
||||
Types.FixedType.getFixed(12),
|
||||
Types.BinaryType.get(),
|
||||
Types.DecimalType.get(9, 4)
|
||||
};
|
||||
|
||||
for (int i = 0; i < primitiveTypes.length; i++) {
|
||||
Type convertPrimitiveResult = AvroInternalSchemaConverter.convertToField(avroPrimitives[i]);
|
||||
Assertions.assertEquals(convertPrimitiveResult, primitiveTypes[i]);
|
||||
Schema convertResult = AvroInternalSchemaConverter.convert(primitiveTypes[i], "t1");
|
||||
Assertions.assertEquals(convertResult, avroPrimitives[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRecordAndPrimitiveTypes() {
|
||||
Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "bool", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "int", Types.IntType.get()),
|
||||
Types.Field.get(2, "long", Types.LongType.get()),
|
||||
Types.Field.get(3, "float", Types.FloatType.get()),
|
||||
Types.Field.get(4, "double", Types.DoubleType.get()),
|
||||
Types.Field.get(5, "date", Types.DateType.get()),
|
||||
Types.Field.get(6, "time", Types.TimeType.get()),
|
||||
Types.Field.get(7, "timestamp", Types.TimestampType.get()),
|
||||
Types.Field.get(8, "string", Types.StringType.get()),
|
||||
Types.Field.get(9, "uuid", Types.UUIDType.get()),
|
||||
Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)),
|
||||
Types.Field.get(11, "binary", Types.BinaryType.get()),
|
||||
Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2))
|
||||
}));
|
||||
|
||||
Schema schema = create("t1",
|
||||
new Schema.Field("bool", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BOOLEAN)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("int", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.INT)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("long", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("float", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.FLOAT)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("double", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.DOUBLE)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("date", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("time", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("timestamp", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("string", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.STRING)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("uuid", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16))), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("fixed", AvroInternalSchemaConverter.nullableSchema(Schema.createFixed("fixed_10", null, null, 10)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("binary", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BYTES)), null, JsonProperties.NULL_VALUE),
|
||||
new Schema.Field("decimal", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.decimal(10, 2)
|
||||
.addToSchema(Schema.createFixed("decimal_10_2", null, null, 5))), null, JsonProperties.NULL_VALUE));
|
||||
Schema convertedSchema = AvroInternalSchemaConverter.convert(record, "t1");
|
||||
Assertions.assertEquals(convertedSchema, schema);
|
||||
Types.RecordType convertedRecord = AvroInternalSchemaConverter.convert(schema).getRecord();
|
||||
Assertions.assertEquals(convertedRecord, record);
|
||||
}
|
||||
|
||||
private Schema create(String name, Schema.Field... fields) {
|
||||
return Schema.createRecord(name, null, null, false, Arrays.asList(fields));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArrayType() {
|
||||
Type arrayNestRecordType = Types.ArrayType.get(1, false,
|
||||
Types.RecordType.get(Arrays.asList(Types.Field.get(2, false, "a", Types.FloatType.get()),
|
||||
Types.Field.get(3, false, "b", Types.FloatType.get()))));
|
||||
|
||||
Schema schema = SchemaBuilder.array().items(create("t1",
|
||||
new Schema.Field("a", Schema.create(Schema.Type.FLOAT), null, null),
|
||||
new Schema.Field("b", Schema.create(Schema.Type.FLOAT), null, null)));
|
||||
Schema convertedSchema = AvroInternalSchemaConverter.convert(arrayNestRecordType, "t1");
|
||||
Assertions.assertEquals(convertedSchema, schema);
|
||||
Types.ArrayType convertedRecord = (Types.ArrayType) AvroInternalSchemaConverter.convertToField(schema);
|
||||
Assertions.assertEquals(convertedRecord, arrayNestRecordType);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testComplexConvert() {
|
||||
String schemaStr = "{\"type\":\"record\",\"name\":\"newTableName\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"},{\"name\":\"data\","
|
||||
+ "\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"preferences\",\"type\":[\"null\","
|
||||
+ "{\"type\":\"record\",\"name\":\"newTableName_preferences\",\"fields\":[{\"name\":\"feature1\","
|
||||
+ "\"type\":\"boolean\"},{\"name\":\"feature2\",\"type\":[\"null\",\"boolean\"],\"default\":null}]}],"
|
||||
+ "\"default\":null},{\"name\":\"locations\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"record\","
|
||||
+ "\"name\":\"newTableName_locations\",\"fields\":[{\"name\":\"lat\",\"type\":\"float\"},{\"name\":\"long\","
|
||||
+ "\"type\":\"float\"}]}}},{\"name\":\"points\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"null\","
|
||||
+ "{\"type\":\"record\",\"name\":\"newTableName_points\",\"fields\":[{\"name\":\"x\",\"type\":\"long\"},"
|
||||
+ "{\"name\":\"y\",\"type\":\"long\"}]}]}],\"default\":null},{\"name\":\"doubles\",\"type\":{\"type\":\"array\",\"items\":\"double\"}},"
|
||||
+ "{\"name\":\"properties\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"null\",\"string\"]}],\"default\":null}]}";
|
||||
Schema schema = new Schema.Parser().parse(schemaStr);
|
||||
|
||||
InternalSchema internalSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(7, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
|
||||
Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
|
||||
Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))),
|
||||
Types.Field.get(5, false,"doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
|
||||
Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
|
||||
);
|
||||
|
||||
Type convertRecord = AvroInternalSchemaConverter.convert(schema).getRecord();
|
||||
Assertions.assertEquals(convertRecord, internalSchema.getRecord());
|
||||
Assertions.assertEquals(schema, AvroInternalSchemaConverter.convert(internalSchema, "newTableName"));
|
||||
}
|
||||
|
||||
@Test
public void testRefreshNewId() {
  // Original schema with field ids 0-9 spread across nested record and map types.
  Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
      Types.Field.get(1, true, "data", Types.StringType.get()),
      Types.Field.get(2, true, "preferences",
          Types.RecordType.get(Types.Field.get(4, false, "feature1",
              Types.BooleanType.get()), Types.Field.get(5, true, "feature2", Types.BooleanType.get()))),
      Types.Field.get(3, false, "locations", Types.MapType.get(6, 7, Types.StringType.get(),
          Types.RecordType.get(Types.Field.get(8, false, "lat", Types.FloatType.get()), Types.Field.get(9, false, "long", Types.FloatType.get())), false))
  );
  // Reassign ids starting from 100; each field's new id is its old id + 100 here,
  // since the original ids were already 0..9 in the same traversal order.
  AtomicInteger newId = new AtomicInteger(100);
  Types.RecordType recordWithNewId = (Types.RecordType) InternalSchemaBuilder.getBuilder().refreshNewId(record, newId);

  // Expected result: identical structure with every id shifted into the 100+ range.
  Types.RecordType newRecord = Types.RecordType.get(Types.Field.get(100, false, "id", Types.IntType.get()),
      Types.Field.get(101, true, "data", Types.StringType.get()),
      Types.Field.get(102, true, "preferences",
          Types.RecordType.get(Types.Field.get(104, false, "feature1",
              Types.BooleanType.get()), Types.Field.get(105, true, "feature2", Types.BooleanType.get()))),
      Types.Field.get(103, false, "locations", Types.MapType.get(106, 107, Types.StringType.get(),
          Types.RecordType.get(Types.Field.get(108, false, "lat", Types.FloatType.get()), Types.Field.get(109, false, "long", Types.FloatType.get())), false))
  );
  Assertions.assertEquals(newRecord, recordWithNewId);
}
|
||||
|
||||
/**
|
||||
* test record data type changes.
|
||||
* int => long/float/double/string
|
||||
* long => float/double/string
|
||||
* float => double/String
|
||||
* double => String/Decimal
|
||||
* Decimal => Decimal/String
|
||||
* String => date/decimal
|
||||
* date => String
|
||||
*/
|
||||
@Test
|
||||
public void testReWriteRecordWithTypeChanged() {
|
||||
Schema avroSchema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"h0_record\",\"namespace\":\"hoodie.h0\",\"fields\""
|
||||
+ ":[{\"name\":\"id\",\"type\":[\"null\",\"int\"],\"default\":null},"
|
||||
+ "{\"name\":\"comb\",\"type\":[\"null\",\"int\"],\"default\":null},"
|
||||
+ "{\"name\":\"com1\",\"type\":[\"null\",\"int\"],\"default\":null},"
|
||||
+ "{\"name\":\"col0\",\"type\":[\"null\",\"int\"],\"default\":null},"
|
||||
+ "{\"name\":\"col1\",\"type\":[\"null\",\"long\"],\"default\":null},"
|
||||
+ "{\"name\":\"col11\",\"type\":[\"null\",\"long\"],\"default\":null},"
|
||||
+ "{\"name\":\"col12\",\"type\":[\"null\",\"long\"],\"default\":null},"
|
||||
+ "{\"name\":\"col2\",\"type\":[\"null\",\"float\"],\"default\":null},"
|
||||
+ "{\"name\":\"col21\",\"type\":[\"null\",\"float\"],\"default\":null},"
|
||||
+ "{\"name\":\"col3\",\"type\":[\"null\",\"double\"],\"default\":null},"
|
||||
+ "{\"name\":\"col31\",\"type\":[\"null\",\"double\"],\"default\":null},"
|
||||
+ "{\"name\":\"col4\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col4\","
|
||||
+ "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null},"
|
||||
+ "{\"name\":\"col41\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col41\","
|
||||
+ "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null},"
|
||||
+ "{\"name\":\"col5\",\"type\":[\"null\",\"string\"],\"default\":null},"
|
||||
+ "{\"name\":\"col51\",\"type\":[\"null\",\"string\"],\"default\":null},"
|
||||
+ "{\"name\":\"col6\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null},"
|
||||
+ "{\"name\":\"col7\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}],\"default\":null},"
|
||||
+ "{\"name\":\"col8\",\"type\":[\"null\",\"boolean\"],\"default\":null},"
|
||||
+ "{\"name\":\"col9\",\"type\":[\"null\",\"bytes\"],\"default\":null},{\"name\":\"par\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null}]}");
|
||||
// create a test record with avroSchema
|
||||
GenericData.Record avroRecord = new GenericData.Record(avroSchema);
|
||||
avroRecord.put("id", 1);
|
||||
avroRecord.put("comb", 100);
|
||||
avroRecord.put("com1", -100);
|
||||
avroRecord.put("col0", 256);
|
||||
avroRecord.put("col1", 1000L);
|
||||
avroRecord.put("col11", -100L);
|
||||
avroRecord.put("col12", 2000L);
|
||||
avroRecord.put("col2", -5.001f);
|
||||
avroRecord.put("col21", 5.001f);
|
||||
avroRecord.put("col3", 12.999d);
|
||||
avroRecord.put("col31", 9999.999d);
|
||||
Schema currentDecimalType = avroSchema.getField("col4").schema().getTypes().get(1);
|
||||
BigDecimal bd = new BigDecimal("123.456").setScale(((LogicalTypes.Decimal) currentDecimalType.getLogicalType()).getScale());
|
||||
avroRecord.put("col4", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, currentDecimalType, currentDecimalType.getLogicalType()));
|
||||
Schema currentDecimalType1 = avroSchema.getField("col41").schema().getTypes().get(1);
|
||||
BigDecimal bd1 = new BigDecimal("7890.456").setScale(((LogicalTypes.Decimal) currentDecimalType1.getLogicalType()).getScale());
|
||||
avroRecord.put("col41", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd1, currentDecimalType1, currentDecimalType1.getLogicalType()));
|
||||
|
||||
avroRecord.put("col5", "2011-01-01");
|
||||
avroRecord.put("col51", "199.342");
|
||||
avroRecord.put("col6", 18987);
|
||||
avroRecord.put("col7", 1640491505000000L);
|
||||
avroRecord.put("col8", false);
|
||||
ByteBuffer bb = ByteBuffer.wrap(new byte[] {97, 48, 53});
|
||||
avroRecord.put("col9", bb);
|
||||
Assertions.assertEquals(GenericData.get().validate(avroSchema, avroRecord), true);
|
||||
InternalSchema internalSchema = AvroInternalSchemaConverter.convert(avroSchema);
|
||||
// do change type operation
|
||||
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(internalSchema);
|
||||
updateChange
|
||||
.updateColumnType("id", Types.LongType.get())
|
||||
.updateColumnType("comb", Types.FloatType.get())
|
||||
.updateColumnType("com1", Types.DoubleType.get())
|
||||
.updateColumnType("col0", Types.StringType.get())
|
||||
.updateColumnType("col1", Types.FloatType.get())
|
||||
.updateColumnType("col11", Types.DoubleType.get())
|
||||
.updateColumnType("col12", Types.StringType.get())
|
||||
.updateColumnType("col2", Types.DoubleType.get())
|
||||
.updateColumnType("col21", Types.StringType.get())
|
||||
.updateColumnType("col3", Types.StringType.get())
|
||||
.updateColumnType("col31", Types.DecimalType.get(18, 9))
|
||||
.updateColumnType("col4", Types.DecimalType.get(18, 9))
|
||||
.updateColumnType("col41", Types.StringType.get())
|
||||
.updateColumnType("col5", Types.DateType.get())
|
||||
.updateColumnType("col51", Types.DecimalType.get(18, 9))
|
||||
.updateColumnType("col6", Types.StringType.get());
|
||||
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange);
|
||||
Schema newAvroSchema = AvroInternalSchemaConverter.convert(newSchema, avroSchema.getName());
|
||||
GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema);
|
||||
|
||||
Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newRecord), true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReWriteNestRecord() {
|
||||
Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false))
|
||||
);
|
||||
Schema schema = AvroInternalSchemaConverter.convert(record, "test1");
|
||||
GenericData.Record avroRecord = new GenericData.Record(schema);
|
||||
GenericData.get().validate(schema, avroRecord);
|
||||
avroRecord.put("id", 2);
|
||||
avroRecord.put("data", "xs");
|
||||
// fill record type
|
||||
GenericData.Record preferencesRecord = new GenericData.Record(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences"));
|
||||
preferencesRecord.put("feature1", false);
|
||||
preferencesRecord.put("feature2", true);
|
||||
Assertions.assertEquals(GenericData.get().validate(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences"), preferencesRecord), true);
|
||||
avroRecord.put("preferences", preferencesRecord);
|
||||
// fill mapType
|
||||
Map<String, GenericData.Record> locations = new HashMap<>();
|
||||
Schema mapSchema = AvroInternalSchemaConverter.convert(((Types.MapType)record.field("locations").type()).valueType(), "test1_locations");
|
||||
GenericData.Record locationsValue = new GenericData.Record(mapSchema);
|
||||
locationsValue.put("lat", 1.2f);
|
||||
locationsValue.put("long", 1.4f);
|
||||
GenericData.Record locationsValue1 = new GenericData.Record(mapSchema);
|
||||
locationsValue1.put("lat", 2.2f);
|
||||
locationsValue1.put("long", 2.4f);
|
||||
locations.put("key1", locationsValue);
|
||||
locations.put("key2", locationsValue1);
|
||||
avroRecord.put("locations", locations);
|
||||
|
||||
List<Double> doubles = new ArrayList<>();
|
||||
doubles.add(2.0d);
|
||||
doubles.add(3.0d);
|
||||
avroRecord.put("doubles", doubles);
|
||||
|
||||
// do check
|
||||
Assertions.assertEquals(GenericData.get().validate(schema, avroRecord), true);
|
||||
// create newSchema
|
||||
Types.RecordType newRecord = Types.RecordType.get(
|
||||
Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
|
||||
Types.Field.get(5, true, "featurex", Types.BooleanType.get()),
|
||||
Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(10, true, "laty", Types.FloatType.get()),
|
||||
Types.Field.get(11, false, "long", Types.FloatType.get())), false)
|
||||
)
|
||||
);
|
||||
|
||||
Schema newAvroSchema = AvroInternalSchemaConverter.convert(newRecord, schema.getName());
|
||||
GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema);
|
||||
// test the correctly of rewrite
|
||||
Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newAvroRecord), true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvolutionSchemaFromNewAvroSchema() {
|
||||
Types.RecordType oldRecord = Types.RecordType.get(
|
||||
Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
|
||||
Types.Field.get(6, true, "featurex", Types.BooleanType.get()),
|
||||
Types.Field.get(7, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(8, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(11, false, "laty", Types.FloatType.get()),
|
||||
Types.Field.get(12, false, "long", Types.FloatType.get())), false)
|
||||
)
|
||||
);
|
||||
InternalSchema oldSchema = new InternalSchema(oldRecord.fields());
|
||||
Types.RecordType evolvedRecord = Types.RecordType.get(
|
||||
Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
|
||||
Types.Field.get(5, true, "featurex", Types.BooleanType.get()),
|
||||
Types.Field.get(6, true, "feature2", Types.BooleanType.get()),
|
||||
Types.Field.get(5, true, "feature3", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(10, false, "laty", Types.FloatType.get()),
|
||||
Types.Field.get(11, false, "long", Types.FloatType.get())), false)
|
||||
),
|
||||
Types.Field.get(0, false, "add1", Types.IntType.get()),
|
||||
Types.Field.get(2, true, "addStruct",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(5, false, "nest1", Types.BooleanType.get()),
|
||||
Types.Field.get(5, true, "nest2", Types.BooleanType.get())))
|
||||
);
|
||||
evolvedRecord = (Types.RecordType)InternalSchemaBuilder.getBuilder().refreshNewId(evolvedRecord, new AtomicInteger(0));
|
||||
Schema evolvedAvroSchema = AvroInternalSchemaConverter.convert(evolvedRecord, "test1");
|
||||
InternalSchema result = AvroSchemaEvolutionUtils.evolveSchemaFromNewAvroSchema(evolvedAvroSchema, oldSchema);
|
||||
Types.RecordType checkedRecord = Types.RecordType.get(
|
||||
Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
|
||||
Types.Field.get(6, true, "featurex", Types.BooleanType.get()),
|
||||
Types.Field.get(7, true, "feature2", Types.BooleanType.get()),
|
||||
Types.Field.get(17, true, "feature3", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(8, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(11, false, "laty", Types.FloatType.get()),
|
||||
Types.Field.get(12, false, "long", Types.FloatType.get())), false)
|
||||
),
|
||||
Types.Field.get(13, true, "add1", Types.IntType.get()),
|
||||
Types.Field.get(14, true, "addStruct",
|
||||
Types.RecordType.get(
|
||||
Types.Field.get(15, false, "nest1", Types.BooleanType.get()),
|
||||
Types.Field.get(16, true, "nest2", Types.BooleanType.get())))
|
||||
);
|
||||
Assertions.assertEquals(result.getRecord(), checkedRecord);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal.schema.utils;
|
||||
|
||||
import org.apache.hudi.internal.schema.InternalSchema;
|
||||
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
|
||||
import org.apache.hudi.internal.schema.Types;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestInternalSchemaUtils {
|
||||
@Test
|
||||
public void testPruneSchema() {
|
||||
Types.RecordType record = getSimpleRecordType();
|
||||
InternalSchema originSchema = new InternalSchema(record.fields());
|
||||
List<Integer> prunedCols = new ArrayList<>();
|
||||
prunedCols.add(4);
|
||||
prunedCols.add(3);
|
||||
prunedCols.add(0);
|
||||
prunedCols.add(2);
|
||||
InternalSchema prunedSchema = InternalSchemaUtils.pruneInternalSchemaByID(originSchema, prunedCols, null);
|
||||
InternalSchema checkedSchema = new InternalSchema(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "bool", Types.BooleanType.get()),
|
||||
Types.Field.get(2, "long", Types.LongType.get()),
|
||||
Types.Field.get(3, "float", Types.FloatType.get()),
|
||||
Types.Field.get(4, "double", Types.DoubleType.get())
|
||||
}));
|
||||
Assertions.assertEquals(prunedSchema, checkedSchema);
|
||||
|
||||
// nest schema
|
||||
Types.RecordType nestRecord = getNestRecordType();
|
||||
InternalSchema originNestSchema = new InternalSchema(nestRecord.fields());
|
||||
List<Integer> prunedNestCols = new ArrayList<>();
|
||||
prunedNestCols.add(0);
|
||||
prunedNestCols.add(1);
|
||||
prunedNestCols.add(5);
|
||||
prunedNestCols.add(11);
|
||||
InternalSchema prunedNestSchema = InternalSchemaUtils.pruneInternalSchemaByID(originNestSchema, prunedNestCols, null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInternalSchemaVisitor() {
|
||||
Types.RecordType nestRecord = getNestRecordType();
|
||||
Map<String, Integer> result = InternalSchemaBuilder.getBuilder().buildNameToId(nestRecord);
|
||||
Assertions.assertEquals(result.size(), 12);
|
||||
Assertions.assertEquals(result.get("locations.value.long"), 11);
|
||||
Assertions.assertEquals(result.get("locations.value.lat"), 10);
|
||||
Assertions.assertEquals(result.get("locations.value"), 9);
|
||||
Assertions.assertEquals(result.get("locations.key"), 8);
|
||||
Assertions.assertEquals(result.get("doubles.element"), 7);
|
||||
|
||||
Types.RecordType simpleRecord = getSimpleRecordType();
|
||||
Map<String, Integer> result1 = InternalSchemaBuilder.getBuilder().buildNameToId(simpleRecord);
|
||||
Assertions.assertEquals(result1.size(), 5);
|
||||
Assertions.assertEquals(result1.get("double"), 4);
|
||||
}
|
||||
|
||||
public Types.RecordType getNestRecordType() {
|
||||
return Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
|
||||
Types.Field.get(1, true, "data", Types.StringType.get()),
|
||||
Types.Field.get(2, true, "preferences",
|
||||
Types.RecordType.get(Types.Field.get(5, false, "feature1",
|
||||
Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
|
||||
Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
|
||||
Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
|
||||
Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false))
|
||||
);
|
||||
}
|
||||
|
||||
public Types.RecordType getSimpleRecordType() {
|
||||
return Types.RecordType.get(Arrays.asList(new Types.Field[] {
|
||||
Types.Field.get(0, "bool", Types.BooleanType.get()),
|
||||
Types.Field.get(1, "int", Types.IntType.get()),
|
||||
Types.Field.get(2, "long", Types.LongType.get()),
|
||||
Types.Field.get(3, "float", Types.FloatType.get()),
|
||||
Types.Field.get(4, "double", Types.DoubleType.get())
|
||||
}));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user