1
0

[RFC-33] [HUDI-2429][Stacked on HUDI-2560] Support full Schema evolution for Spark (#4910)

* [HUDI-2560] introduce id_based schema to support full schema evolution.

* add test for FileBasedInternalSchemaStorageManager and rebase code

* add support for change column type and fix some test case

* fix some bugs encountered in the production env and delete useless code

* fix test error

* rebase code

* fixed some nested schema change bugs

* [HUDI-2429][Stacked On HUDI-2560]Support full schema evolution for spark

* [use dummyInternalSchema instead of null]

* add support for spark3.1.x

* remove support for spark3.1.x, since some compilations fail

* support spark3.1.x

* rebase and prepare solve all comments

* address all comments

* rebase code

* fixed the count(*) bug

* try to get internalSchema by parsing the commit file/history file directly, instead of using the metaclient, which is time-consuming
address some comments

* fixed all comments

* fix new comments

* rebase code,fix UT failed

* fixed mistake

* rebase code ,fixed new comments

* rebase code , and prepare for address new comments

* address comments

* address new comments

* fix new issues

* control fallback original write logical
This commit is contained in:
xiarixiaoyao
2022-04-02 04:20:24 +08:00
committed by GitHub
parent 9275b8fc7e
commit 444ff496a4
89 changed files with 10352 additions and 106 deletions

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Assertions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;
/**
 * Tests for {@code SerDeHelper}: schema-to-JSON round-trips, versioned schema
 * search and schema-history inheritance.
 */
public class TestSerDeHelper {

  @Test
  public void testComplexSchema2Json() {
    // Schema covering nested records, maps and arrays to exercise the full serializer.
    InternalSchema internalSchema = new InternalSchema(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()),
                Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
            Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()),
                Types.Field.get(15, false, "y", Types.LongType.get())))),
        Types.Field.get(5, false, "doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
        Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
    );
    // Single-schema round-trip: schema -> json -> schema.
    String result = SerDeHelper.toJson(internalSchema);
    InternalSchema convertedSchema = SerDeHelper.fromJson(result).get();
    Assertions.assertEquals(internalSchema, convertedSchema);
    // Schema-list round-trip: schemas -> json -> map of schemas.
    String results = SerDeHelper.toJson(Arrays.asList(internalSchema));
    TreeMap<Long, InternalSchema> convertedSchemas = SerDeHelper.parseSchemas(results);
    Assertions.assertEquals(1, convertedSchemas.size());
  }

  @Test
  public void testPrimitive2Json() {
    // One field for every supported primitive type.
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "bool", Types.BooleanType.get()),
        Types.Field.get(1, "int", Types.IntType.get()),
        Types.Field.get(2, "long", Types.LongType.get()),
        Types.Field.get(3, "float", Types.FloatType.get()),
        Types.Field.get(4, "double", Types.DoubleType.get()),
        Types.Field.get(5, "date", Types.DateType.get()),
        Types.Field.get(6, "time", Types.TimeType.get()),
        Types.Field.get(7, "timestamp", Types.TimestampType.get()),
        Types.Field.get(8, "string", Types.StringType.get()),
        Types.Field.get(9, "uuid", Types.UUIDType.get()),
        Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)),
        Types.Field.get(11, "binary", Types.BinaryType.get()),
        Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2))));
    InternalSchema internalSchema = new InternalSchema(record.fields());
    String result = SerDeHelper.toJson(internalSchema);
    InternalSchema convertedSchema = SerDeHelper.fromJson(result).get();
    Assertions.assertEquals(internalSchema, convertedSchema);
  }

  @Test
  public void testSearchSchema() {
    // Schema ids 0, 10, ..., 990; a lookup should resolve to the schema whose
    // id is the greatest one not exceeding the requested version.
    List<InternalSchema> schemas = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
      schemas.add(new InternalSchema(i * 10,
          Arrays.asList(Types.Field.get(1, true, "schema" + i * 10, Types.LongType.get()))));
    }
    Assertions.assertEquals(Types.Field.get(1, true, "schema" + 0, Types.LongType.get()),
        InternalSchemaUtils.searchSchema(0, schemas).getRecord().fields().get(0));
    Assertions.assertEquals(Types.Field.get(1, true, "schema" + 0, Types.LongType.get()),
        InternalSchemaUtils.searchSchema(9, schemas).getRecord().fields().get(0));
    Assertions.assertEquals(Types.Field.get(1, true, "schema" + 90, Types.LongType.get()),
        InternalSchemaUtils.searchSchema(99, schemas).getRecord().fields().get(0));
    Assertions.assertEquals(Types.Field.get(1, true, "schema" + 990, Types.LongType.get()),
        InternalSchemaUtils.searchSchema(9999, schemas).getRecord().fields().get(0));
  }

  @Test
  public void testInheritSchemas() {
    List<InternalSchema> schemas = new ArrayList<>();
    for (int i = 0; i < 2; i++) {
      schemas.add(new InternalSchema(i,
          Arrays.asList(Types.Field.get(1, true, "schema" + i, Types.LongType.get()))));
    }
    String oldSchemas = SerDeHelper.toJson(schemas);
    InternalSchema newSchema = new InternalSchema(3,
        Arrays.asList(Types.Field.get(1, true, "schema" + 3, Types.LongType.get())));
    // Appending the new schema to the serialized history yields three schemas in total.
    String finalResult = SerDeHelper.inheritSchemas(newSchema, oldSchemas);
    Assertions.assertEquals(3, SerDeHelper.parseSchemas(finalResult).size());
  }
}

View File

@@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema.action;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
/**
 * Tests for {@code InternalSchemaMerger}: merging an old (file) schema with an
 * evolved (query) schema after a sequence of add/delete/update changes.
 */
public class TestMergeSchema {

  @Test
  public void testPrimitiveMerge() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    InternalSchema oldSchema = new InternalSchema(record.fields());
    // Evolve: add 'c1' after 'col1', and 'c2' before 'col3'.
    TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
    addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1");
    addChange.addPositionChange("c1", "col1", "after");
    addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3");
    addChange.addPositionChange("c2", "col3", "before");
    InternalSchema newAddSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
    // Drop 'col1' and 'col3'.
    TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(newAddSchema);
    deleteChange.deleteColumn("col1");
    deleteChange.deleteColumn("col3");
    InternalSchema newDeleteSchema = SchemaChangeUtils.applyTableChanges2Schema(newAddSchema, deleteChange);
    // Retype, comment, rename and reposition 'col2'.
    TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(newDeleteSchema);
    updateChange.updateColumnType("col2", Types.LongType.get())
        .updateColumnComment("col2", "alter col2 comments")
        .renameColumn("col2", "colx")
        .addPositionChange("col2", "col4", "after");
    InternalSchema updateSchema = SchemaChangeUtils.applyTableChanges2Schema(newDeleteSchema, updateChange);
    // Re-add a column named 'col1' after the old one was deleted.
    TableChanges.ColumnAddChange addChange1 = TableChanges.ColumnAddChange.get(updateSchema);
    addChange1.addColumns("col1", Types.BooleanType.get(), "add new col1");
    InternalSchema finalSchema = SchemaChangeUtils.applyTableChanges2Schema(updateSchema, addChange1);
    // Merge using the column types from the query schema: 'col2' shows its new long type.
    InternalSchema mergeSchema = new InternalSchemaMerger(oldSchema, finalSchema, true, false).mergeSchema();
    InternalSchema checkedSchema = new InternalSchema(Arrays.asList(
        Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
        Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
        Types.Field.get(3, true, "col4", Types.FloatType.get()),
        Types.Field.get(1, true, "col2", Types.LongType.get(), "alter col2 comments"),
        Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), "add new col1")));
    Assertions.assertEquals(checkedSchema, mergeSchema);
    // Merge using the column types from the file schema: 'col2' keeps its original int type.
    InternalSchema mergeSchema1 = new InternalSchemaMerger(oldSchema, finalSchema, true, true).mergeSchema();
    InternalSchema checkedSchema1 = new InternalSchema(Arrays.asList(
        Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
        Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
        Types.Field.get(3, true, "col4", Types.FloatType.get()),
        Types.Field.get(1, true, "col2", Types.IntType.get(), "alter col2 comments"),
        Types.Field.get(6, true, "col1suffix", Types.BooleanType.get(), "add new col1")));
    Assertions.assertEquals(checkedSchema1, mergeSchema1);
  }
}

View File

@@ -0,0 +1,229 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema.action;
import org.apache.hudi.internal.schema.HoodieSchemaException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.utils.SchemaChangeUtils;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Assertions;
import java.util.Arrays;
/**
 * Tests for {@code TableChanges}: applying column add / delete / update changes
 * to both flat and nested (record, map, array) schemas.
 */
public class TestTableChanges {

  @Test
  public void testPrimitiveAdd() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(4, true, "c1", Types.BooleanType.get(), "add c1 after col1"),
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(5, true, "c2", Types.IntType.get(), "add c2 before col3"),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    InternalSchema oldSchema = new InternalSchema(record.fields());
    // Add 'c1' after 'col1', and 'c2' before 'col3'.
    TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
    addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1");
    // Adding the same column twice must be rejected.
    Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("c1", Types.BooleanType.get(), "add c1 after col1"));
    addChange.addPositionChange("c1", "col1", "after");
    addChange.addColumns("c2", Types.IntType.get(), "add c2 before col3");
    addChange.addPositionChange("c2", "col3", "before");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
    Assertions.assertEquals(checkRecord, newSchema.getRecord());
  }

  @Test
  public void testNestAdd() {
    InternalSchema oldSchema = new InternalSchema(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()),
                Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
            Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()),
                Types.Field.get(15, false, "y", Types.LongType.get())))),
        Types.Field.get(5, false, "doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
        Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
    );
    TableChanges.ColumnAddChange addChange = TableChanges.ColumnAddChange.get(oldSchema);
    // Add top-level 'c1' as the first column.
    addChange.addColumns("c1", Types.StringType.get(), "add c1 first");
    addChange.addPositionChange("c1", "id", "before");
    // Add preferences.cx before preferences.feature2.
    addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2");
    // Adding the same nested column twice must be rejected.
    Assertions.assertThrows(HoodieSchemaException.class, () -> addChange.addColumns("preferences", "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"));
    addChange.addPositionChange("preferences.cx", "preferences.feature2", "before");
    // Add locations.value.lax before locations.value.long (nested in a map value).
    addChange.addColumns("locations.value", "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long");
    addChange.addPositionChange("locations.value.lax", "locations.value.long", "before");
    // Add points.element.z after points.element.y (nested in an array element).
    addChange.addColumns("points.element", "z", Types.BooleanType.get(), "add points.element.z after points.element.y");
    addChange.addPositionChange("points.element.z", "points.element.y", "after");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, addChange);
    InternalSchema checkedSchema = new InternalSchema(
        Types.Field.get(19, true, "c1", Types.StringType.get(), "add c1 first"),
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(7, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(20, true, "cx", Types.BooleanType.get(), "add preferences.cx before preferences.feature2"),
                Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()),
                Types.Field.get(21, true, "lax", Types.BooleanType.get(), "add locations.value.lax before locations.value.long"),
                Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
            Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()),
                Types.Field.get(15, false, "y", Types.LongType.get()),
                Types.Field.get(22, true, "z", Types.BooleanType.get(), "add points.element.z after points.element.y")))),
        Types.Field.get(5, false, "doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
        Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
    );
    Assertions.assertEquals(checkedSchema.getRecord(), newSchema.getRecord());
  }

  @Test
  public void testPrimitiveDelete() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    InternalSchema oldSchema = new InternalSchema(record.fields());
    TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema);
    deleteChange.deleteColumn("col1");
    // Deleting the same column twice is tolerated; only one delete operation is kept.
    deleteChange.deleteColumn("col1");
    deleteChange.deleteColumn("col3");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange);
    Types.RecordType checkRecord = Types.RecordType.get(Arrays.asList(
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    Assertions.assertEquals(checkRecord, newSchema.getRecord());
  }

  @Test
  public void testNestDelete() {
    InternalSchema oldSchema = new InternalSchema(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()),
                Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
            Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()),
                Types.Field.get(13, false, "y", Types.LongType.get()))))
    );
    TableChanges.ColumnDeleteChange deleteChange = TableChanges.ColumnDeleteChange.get(oldSchema);
    deleteChange.deleteColumn("data");
    deleteChange.deleteColumn("preferences.feature2");
    // Repeated nested delete is tolerated as well.
    deleteChange.deleteColumn("preferences.feature2");
    deleteChange.deleteColumn("locations.value.lat");
    deleteChange.deleteColumn("points.element.y");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, deleteChange);
    InternalSchema checkedSchema = new InternalSchema(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(5, false, "feature1", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
            Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()))))
    );
    Assertions.assertEquals(checkedSchema.getRecord(), newSchema.getRecord());
  }

  @Test
  public void testPrimitiveUpdate() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(1, "col2", Types.IntType.get()),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get())));
    InternalSchema oldSchema = new InternalSchema(record.fields());
    // Retype, comment, rename and reposition 'col2' in a single change.
    TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema);
    updateChange.updateColumnType("col2", Types.LongType.get())
        .updateColumnComment("col2", "alter col2 comments")
        .renameColumn("col2", "colx")
        .addPositionChange("col2", "col4", "after");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange);
    Types.RecordType checkedRecord = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "col1", Types.BooleanType.get()),
        Types.Field.get(2, "col3", Types.LongType.get()),
        Types.Field.get(3, "col4", Types.FloatType.get()),
        Types.Field.get(1, true, "colx", Types.LongType.get(), "alter col2 comments")));
    Assertions.assertEquals(checkedRecord, newSchema.getRecord());
  }

  @Test
  public void testNestUpdate() {
    InternalSchema oldSchema = new InternalSchema(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(9, false, "lat", Types.FloatType.get()),
                Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
            Types.RecordType.get(Types.Field.get(12, false, "x", Types.LongType.get()),
                Types.Field.get(13, false, "y", Types.LongType.get()))))
    );
    TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(oldSchema);
    // Top level: relax nullability, rename 'id', move 'data' to the end.
    updateChange
        .updateColumnNullability("id", true)
        .renameColumn("id", "idx")
        .addPositionChange("data", "points", "after");
    // Nested record inside a struct.
    updateChange
        .updateColumnComment("preferences.feature1", "add feature1 comment")
        .renameColumn("preferences.feature1", "f1")
        .addPositionChange("preferences.feature1", "preferences.feature1", "first");
    // Nested record inside a map value.
    updateChange.updateColumnComment("locations.value.lat", "add lat comment")
        .renameColumn("locations.value.lat", "lax")
        .addPositionChange("locations.value.lat", "locations.value.lat", "first");
    // Nested record inside an array element.
    updateChange.renameColumn("points.element.x", "z")
        .addPositionChange("points.element.x", "points.element.y", "after");
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(oldSchema, updateChange);
    InternalSchema checkSchema = new InternalSchema(
        Types.Field.get(0, true, "idx", Types.IntType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(5, false, "f1", Types.BooleanType.get(), "add feature1 comment"),
                Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(7, 8, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(9, false, "lax", Types.FloatType.get(), "add lat comment"),
                Types.Field.get(10, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(11, true,
            Types.RecordType.get(Types.Field.get(13, false, "y", Types.LongType.get()),
                Types.Field.get(12, false, "z", Types.LongType.get())))),
        Types.Field.get(1, true, "data", Types.StringType.get())
    );
    Assertions.assertEquals(checkSchema.getRecord(), newSchema.getRecord());
  }
}

View File

@@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema.io;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Tests {@link FileBasedInternalSchemaStorageManager}.
*/
/**
 * Tests {@link FileBasedInternalSchemaStorageManager}: persisting schema-history
 * files alongside commits, reading them back, and cleaning residual files left
 * by uncommitted writes.
 */
public class TestFileBasedInternalSchemaStorageManager extends HoodieCommonTestHarness {
  private HoodieActiveTimeline timeline;

  @BeforeEach
  public void setUp() throws Exception {
    initMetaClient();
  }

  @Test
  public void testPersistAndReadHistorySchemaStr() throws IOException {
    timeline = new HoodieActiveTimeline(metaClient);
    FileBasedInternalSchemaStorageManager fm = new FileBasedInternalSchemaStorageManager(metaClient);
    InternalSchema currentSchema = getSimpleSchema();
    currentSchema.setSchemaId(0L);
    // Persist the first schema and simulate its commit.
    fm.persistHistorySchemaStr("0000", SerDeHelper.inheritSchemas(currentSchema, ""));
    simulateCommit("0000");
    metaClient.reloadActiveTimeline();
    // Read the schema back by its id.
    InternalSchema readSchema = fm.getSchemaByKey("0").get();
    assertEquals(currentSchema, readSchema);
    // Persist a second history schema and commit it.
    InternalSchema secondSchema = getSimpleSchema();
    secondSchema.setSchemaId(1L);
    fm.persistHistorySchemaStr("0001", SerDeHelper.inheritSchemas(secondSchema, fm.getHistorySchemaStr()));
    simulateCommit("0001");
    metaClient.reloadActiveTimeline();
    assertEquals(secondSchema, fm.getSchemaByKey("1").get());
    // Simulate a failed write: persist "0002" without committing it, so its
    // schema file becomes a residual file.
    InternalSchema thirdSchema = getSimpleSchema();
    thirdSchema.setSchemaId(2L);
    fm.persistHistorySchemaStr("0002", SerDeHelper.inheritSchemas(thirdSchema, fm.getHistorySchemaStr()));
    // The 4th (committed) persist should clean up the residual file left by the 3rd.
    InternalSchema lastSchema = getSimpleSchema();
    lastSchema.setSchemaId(3L);
    fm.persistHistorySchemaStr("0004", SerDeHelper.inheritSchemas(lastSchema, fm.getHistorySchemaStr()));
    simulateCommit("0004");
    metaClient.reloadActiveTimeline();
    File f = new File(metaClient.getSchemaFolderName() + File.separator + "0002.schemacommit");
    assertFalse(f.exists());
    assertEquals(lastSchema, fm.getSchemaByKey("3").get());
  }

  /** Drives {@code commitTime} through requested -> inflight -> completed on the active timeline. */
  private void simulateCommit(String commitTime) {
    if (timeline == null) {
      timeline = new HoodieActiveTimeline(metaClient);
    }
    HoodieInstant instant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMMIT_ACTION, commitTime);
    timeline.createNewInstant(instant);
    timeline.transitionRequestedToInflight(instant, Option.empty());
    timeline.saveAsComplete(new HoodieInstant(true, instant.getAction(), instant.getTimestamp()),
        Option.empty());
  }

  /** Builds a minimal two-field schema; each call returns a fresh instance. */
  private InternalSchema getSimpleSchema() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(
        Types.Field.get(0, "bool", Types.BooleanType.get()),
        Types.Field.get(1, "int", Types.IntType.get())));
    return new InternalSchema(record.fields());
  }
}

View File

@@ -0,0 +1,422 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema.utils;
import org.apache.avro.JsonProperties;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.TableChanges;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Assertions;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Tests for conversion between Avro schemas and Hudi internal schemas
 * ({@code AvroInternalSchemaConverter}), field-id refresh
 * ({@code InternalSchemaBuilder#refreshNewId}), record rewriting under column
 * type changes, and schema evolution ({@code AvroSchemaEvolutionUtils}).
 */
public class TestAvroSchemaEvolutionUtils {

  /**
   * Every supported Avro primitive/logical type must round-trip:
   * Avro -&gt; internal type via {@code convertToField}, and
   * internal type -&gt; Avro via {@code convert}.
   */
  @Test
  public void testPrimitiveTypes() {
    Schema[] avroPrimitives = new Schema[] {
        Schema.create(Schema.Type.BOOLEAN),
        Schema.create(Schema.Type.INT),
        Schema.create(Schema.Type.LONG),
        Schema.create(Schema.Type.FLOAT),
        Schema.create(Schema.Type.DOUBLE),
        LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)),
        LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)),
        LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)),
        Schema.create(Schema.Type.STRING),
        LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)),
        Schema.createFixed("fixed_12", null, null, 12),
        Schema.create(Schema.Type.BYTES),
        LogicalTypes.decimal(9, 4).addToSchema(Schema.createFixed("decimal_9_4", null, null, 4))};
    // Expected internal types, positionally aligned with avroPrimitives above.
    Type[] primitiveTypes = new Type[] {
        Types.BooleanType.get(),
        Types.IntType.get(),
        Types.LongType.get(),
        Types.FloatType.get(),
        Types.DoubleType.get(),
        Types.DateType.get(),
        Types.TimeType.get(),
        Types.TimestampType.get(),
        Types.StringType.get(),
        Types.UUIDType.get(),
        Types.FixedType.getFixed(12),
        Types.BinaryType.get(),
        Types.DecimalType.get(9, 4)
    };
    for (int i = 0; i < primitiveTypes.length; i++) {
      // Avro -> internal type.
      Type convertPrimitiveResult = AvroInternalSchemaConverter.convertToField(avroPrimitives[i]);
      Assertions.assertEquals(primitiveTypes[i], convertPrimitiveResult);
      // Internal type -> Avro.
      Schema convertResult = AvroInternalSchemaConverter.convert(primitiveTypes[i], "t1");
      Assertions.assertEquals(avroPrimitives[i], convertResult);
    }
  }

  /**
   * A flat record of all primitive types must round-trip between an internal
   * {@code RecordType} and an Avro record schema with nullable (union-with-null) fields.
   */
  @Test
  public void testRecordAndPrimitiveTypes() {
    Types.RecordType record = Types.RecordType.get(Arrays.asList(new Types.Field[] {
        Types.Field.get(0, "bool", Types.BooleanType.get()),
        Types.Field.get(1, "int", Types.IntType.get()),
        Types.Field.get(2, "long", Types.LongType.get()),
        Types.Field.get(3, "float", Types.FloatType.get()),
        Types.Field.get(4, "double", Types.DoubleType.get()),
        Types.Field.get(5, "date", Types.DateType.get()),
        Types.Field.get(6, "time", Types.TimeType.get()),
        Types.Field.get(7, "timestamp", Types.TimestampType.get()),
        Types.Field.get(8, "string", Types.StringType.get()),
        Types.Field.get(9, "uuid", Types.UUIDType.get()),
        Types.Field.get(10, "fixed", Types.FixedType.getFixed(10)),
        Types.Field.get(11, "binary", Types.BinaryType.get()),
        Types.Field.get(12, "decimal", Types.DecimalType.get(10, 2))
    }));
    // Hand-built Avro equivalent: every field optional (["null", type]) with null default.
    Schema schema = create("t1",
        new Schema.Field("bool", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BOOLEAN)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("int", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.INT)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("long", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.LONG)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("float", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.FLOAT)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("double", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.DOUBLE)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("date", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), null, JsonProperties.NULL_VALUE),
        new Schema.Field("time", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE),
        new Schema.Field("timestamp", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))), null, JsonProperties.NULL_VALUE),
        new Schema.Field("string", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.STRING)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("uuid", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16))), null, JsonProperties.NULL_VALUE),
        new Schema.Field("fixed", AvroInternalSchemaConverter.nullableSchema(Schema.createFixed("fixed_10", null, null, 10)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("binary", AvroInternalSchemaConverter.nullableSchema(Schema.create(Schema.Type.BYTES)), null, JsonProperties.NULL_VALUE),
        new Schema.Field("decimal", AvroInternalSchemaConverter.nullableSchema(LogicalTypes.decimal(10, 2)
            .addToSchema(Schema.createFixed("decimal_10_2", null, null, 5))), null, JsonProperties.NULL_VALUE));
    Schema convertedSchema = AvroInternalSchemaConverter.convert(record, "t1");
    Assertions.assertEquals(schema, convertedSchema);
    Types.RecordType convertedRecord = AvroInternalSchemaConverter.convert(schema).getRecord();
    Assertions.assertEquals(record, convertedRecord);
  }

  /** Builds an Avro record schema named {@code name} from the given fields. */
  private Schema create(String name, Schema.Field... fields) {
    return Schema.createRecord(name, null, null, false, Arrays.asList(fields));
  }

  /**
   * An array whose element is a required (non-null) nested record must round-trip
   * between the internal {@code ArrayType} and an Avro array schema.
   */
  @Test
  public void testArrayType() {
    Type arrayNestRecordType = Types.ArrayType.get(1, false,
        Types.RecordType.get(Arrays.asList(Types.Field.get(2, false, "a", Types.FloatType.get()),
            Types.Field.get(3, false, "b", Types.FloatType.get()))));
    Schema schema = SchemaBuilder.array().items(create("t1",
        new Schema.Field("a", Schema.create(Schema.Type.FLOAT), null, null),
        new Schema.Field("b", Schema.create(Schema.Type.FLOAT), null, null)));
    Schema convertedSchema = AvroInternalSchemaConverter.convert(arrayNestRecordType, "t1");
    Assertions.assertEquals(schema, convertedSchema);
    Types.ArrayType convertedRecord = (Types.ArrayType) AvroInternalSchemaConverter.convertToField(schema);
    Assertions.assertEquals(arrayNestRecordType, convertedRecord);
  }

  /**
   * A realistic schema mixing optional fields, nested records, maps and arrays
   * (with nullable elements/values) must round-trip in both directions, with the
   * internal field ids assigned top-level-first and nested ids following.
   */
  @Test
  public void testComplexConvert() {
    String schemaStr = "{\"type\":\"record\",\"name\":\"newTableName\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"},{\"name\":\"data\","
        + "\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"preferences\",\"type\":[\"null\","
        + "{\"type\":\"record\",\"name\":\"newTableName_preferences\",\"fields\":[{\"name\":\"feature1\","
        + "\"type\":\"boolean\"},{\"name\":\"feature2\",\"type\":[\"null\",\"boolean\"],\"default\":null}]}],"
        + "\"default\":null},{\"name\":\"locations\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"record\","
        + "\"name\":\"newTableName_locations\",\"fields\":[{\"name\":\"lat\",\"type\":\"float\"},{\"name\":\"long\","
        + "\"type\":\"float\"}]}}},{\"name\":\"points\",\"type\":[\"null\",{\"type\":\"array\",\"items\":[\"null\","
        + "{\"type\":\"record\",\"name\":\"newTableName_points\",\"fields\":[{\"name\":\"x\",\"type\":\"long\"},"
        + "{\"name\":\"y\",\"type\":\"long\"}]}]}],\"default\":null},{\"name\":\"doubles\",\"type\":{\"type\":\"array\",\"items\":\"double\"}},"
        + "{\"name\":\"properties\",\"type\":[\"null\",{\"type\":\"map\",\"values\":[\"null\",\"string\"]}],\"default\":null}]}";
    Schema schema = new Schema.Parser().parse(schemaStr);
    // Ids 0-6 cover the top-level columns; 7+ are assigned to nested fields.
    InternalSchema internalSchema = new InternalSchema(Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(7, false, "feature1",
                Types.BooleanType.get()), Types.Field.get(8, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(11, false, "lat", Types.FloatType.get()), Types.Field.get(12, false, "long", Types.FloatType.get())), false)),
        Types.Field.get(4, true, "points", Types.ArrayType.get(13, true,
            Types.RecordType.get(Types.Field.get(14, false, "x", Types.LongType.get()), Types.Field.get(15, false, "y", Types.LongType.get())))),
        Types.Field.get(5, false, "doubles", Types.ArrayType.get(16, false, Types.DoubleType.get())),
        Types.Field.get(6, true, "properties", Types.MapType.get(17, 18, Types.StringType.get(), Types.StringType.get()))
    );
    Type convertRecord = AvroInternalSchemaConverter.convert(schema).getRecord();
    Assertions.assertEquals(internalSchema.getRecord(), convertRecord);
    Assertions.assertEquals(schema, AvroInternalSchemaConverter.convert(internalSchema, "newTableName"));
  }

  /**
   * {@code refreshNewId} must reassign ids depth-first starting from the given counter
   * (here 100), preserving the relative id layout: top-level fields first, then nested.
   */
  @Test
  public void testRefreshNewId() {
    Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(4, false, "feature1",
                Types.BooleanType.get()), Types.Field.get(5, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "locations", Types.MapType.get(6, 7, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(8, false, "lat", Types.FloatType.get()), Types.Field.get(9, false, "long", Types.FloatType.get())), false))
    );
    AtomicInteger newId = new AtomicInteger(100);
    Types.RecordType recordWithNewId = (Types.RecordType) InternalSchemaBuilder.getBuilder().refreshNewId(record, newId);
    // Same structure with every id shifted by +100.
    Types.RecordType newRecord = Types.RecordType.get(Types.Field.get(100, false, "id", Types.IntType.get()),
        Types.Field.get(101, true, "data", Types.StringType.get()),
        Types.Field.get(102, true, "preferences",
            Types.RecordType.get(Types.Field.get(104, false, "feature1",
                Types.BooleanType.get()), Types.Field.get(105, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(103, false, "locations", Types.MapType.get(106, 107, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(108, false, "lat", Types.FloatType.get()), Types.Field.get(109, false, "long", Types.FloatType.get())), false))
    );
    Assertions.assertEquals(newRecord, recordWithNewId);
  }

  /**
   * Tests record rewriting under data type changes:
   * int => long/float/double/string,
   * long => float/double/string,
   * float => double/string,
   * double => string/decimal,
   * decimal => decimal/string,
   * string => date/decimal,
   * date => string.
   * The rewritten record must validate against the evolved Avro schema.
   */
  @Test
  public void testReWriteRecordWithTypeChanged() {
    Schema avroSchema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"h0_record\",\"namespace\":\"hoodie.h0\",\"fields\""
        + ":[{\"name\":\"id\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"comb\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"com1\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"col0\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"col1\",\"type\":[\"null\",\"long\"],\"default\":null},"
        + "{\"name\":\"col11\",\"type\":[\"null\",\"long\"],\"default\":null},"
        + "{\"name\":\"col12\",\"type\":[\"null\",\"long\"],\"default\":null},"
        + "{\"name\":\"col2\",\"type\":[\"null\",\"float\"],\"default\":null},"
        + "{\"name\":\"col21\",\"type\":[\"null\",\"float\"],\"default\":null},"
        + "{\"name\":\"col3\",\"type\":[\"null\",\"double\"],\"default\":null},"
        + "{\"name\":\"col31\",\"type\":[\"null\",\"double\"],\"default\":null},"
        + "{\"name\":\"col4\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col4\","
        + "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null},"
        + "{\"name\":\"col41\",\"type\":[\"null\",{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"hoodie.h0.h0_record.col41\","
        + "\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":4}],\"default\":null},"
        + "{\"name\":\"col5\",\"type\":[\"null\",\"string\"],\"default\":null},"
        + "{\"name\":\"col51\",\"type\":[\"null\",\"string\"],\"default\":null},"
        + "{\"name\":\"col6\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null},"
        + "{\"name\":\"col7\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}],\"default\":null},"
        + "{\"name\":\"col8\",\"type\":[\"null\",\"boolean\"],\"default\":null},"
        + "{\"name\":\"col9\",\"type\":[\"null\",\"bytes\"],\"default\":null},{\"name\":\"par\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}],\"default\":null}]}");
    // Create a test record conforming to avroSchema.
    GenericData.Record avroRecord = new GenericData.Record(avroSchema);
    avroRecord.put("id", 1);
    avroRecord.put("comb", 100);
    avroRecord.put("com1", -100);
    avroRecord.put("col0", 256);
    avroRecord.put("col1", 1000L);
    avroRecord.put("col11", -100L);
    avroRecord.put("col12", 2000L);
    avroRecord.put("col2", -5.001f);
    avroRecord.put("col21", 5.001f);
    avroRecord.put("col3", 12.999d);
    avroRecord.put("col31", 9999.999d);
    // Decimal fields are stored as fixed; build the fixed bytes from BigDecimal values.
    Schema currentDecimalType = avroSchema.getField("col4").schema().getTypes().get(1);
    BigDecimal bd = new BigDecimal("123.456").setScale(((LogicalTypes.Decimal) currentDecimalType.getLogicalType()).getScale());
    avroRecord.put("col4", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, currentDecimalType, currentDecimalType.getLogicalType()));
    Schema currentDecimalType1 = avroSchema.getField("col41").schema().getTypes().get(1);
    BigDecimal bd1 = new BigDecimal("7890.456").setScale(((LogicalTypes.Decimal) currentDecimalType1.getLogicalType()).getScale());
    avroRecord.put("col41", HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd1, currentDecimalType1, currentDecimalType1.getLogicalType()));
    avroRecord.put("col5", "2011-01-01");
    avroRecord.put("col51", "199.342");
    avroRecord.put("col6", 18987);
    avroRecord.put("col7", 1640491505000000L);
    avroRecord.put("col8", false);
    ByteBuffer bb = ByteBuffer.wrap(new byte[] {97, 48, 53});
    avroRecord.put("col9", bb);
    Assertions.assertTrue(GenericData.get().validate(avroSchema, avroRecord));
    InternalSchema internalSchema = AvroInternalSchemaConverter.convert(avroSchema);
    // Apply one type change per column to exercise the full promotion matrix above.
    TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(internalSchema);
    updateChange
        .updateColumnType("id", Types.LongType.get())
        .updateColumnType("comb", Types.FloatType.get())
        .updateColumnType("com1", Types.DoubleType.get())
        .updateColumnType("col0", Types.StringType.get())
        .updateColumnType("col1", Types.FloatType.get())
        .updateColumnType("col11", Types.DoubleType.get())
        .updateColumnType("col12", Types.StringType.get())
        .updateColumnType("col2", Types.DoubleType.get())
        .updateColumnType("col21", Types.StringType.get())
        .updateColumnType("col3", Types.StringType.get())
        .updateColumnType("col31", Types.DecimalType.get(18, 9))
        .updateColumnType("col4", Types.DecimalType.get(18, 9))
        .updateColumnType("col41", Types.StringType.get())
        .updateColumnType("col5", Types.DateType.get())
        .updateColumnType("col51", Types.DecimalType.get(18, 9))
        .updateColumnType("col6", Types.StringType.get());
    InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange);
    Schema newAvroSchema = AvroInternalSchemaConverter.convert(newSchema, avroSchema.getName());
    GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema);
    // The rewritten record must be valid under the evolved schema.
    Assertions.assertTrue(GenericData.get().validate(newAvroSchema, newRecord));
  }

  /**
   * Rewriting a nested record (record inside record, map values, array elements) to a
   * schema that adds an optional nested column ("featurex"), renames "lat" to a new
   * optional "laty", and keeps the rest, must yield a record valid under the new schema.
   */
  @Test
  public void testReWriteNestRecord() {
    Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(Types.Field.get(5, false, "feature1",
                Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
        Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
            Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false))
    );
    Schema schema = AvroInternalSchemaConverter.convert(record, "test1");
    GenericData.Record avroRecord = new GenericData.Record(schema);
    avroRecord.put("id", 2);
    avroRecord.put("data", "xs");
    // Fill the nested record field.
    GenericData.Record preferencesRecord = new GenericData.Record(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences"));
    preferencesRecord.put("feature1", false);
    preferencesRecord.put("feature2", true);
    Assertions.assertTrue(GenericData.get().validate(AvroInternalSchemaConverter.convert(record.fieldType("preferences"), "test1_preferences"), preferencesRecord));
    avroRecord.put("preferences", preferencesRecord);
    // Fill the map field.
    Map<String, GenericData.Record> locations = new HashMap<>();
    Schema mapSchema = AvroInternalSchemaConverter.convert(((Types.MapType) record.field("locations").type()).valueType(), "test1_locations");
    GenericData.Record locationsValue = new GenericData.Record(mapSchema);
    locationsValue.put("lat", 1.2f);
    locationsValue.put("long", 1.4f);
    GenericData.Record locationsValue1 = new GenericData.Record(mapSchema);
    locationsValue1.put("lat", 2.2f);
    locationsValue1.put("long", 2.4f);
    locations.put("key1", locationsValue);
    locations.put("key2", locationsValue1);
    avroRecord.put("locations", locations);
    // Fill the array field.
    List<Double> doubles = new ArrayList<>();
    doubles.add(2.0d);
    doubles.add(3.0d);
    avroRecord.put("doubles", doubles);
    // The fully populated source record must be valid before rewriting.
    Assertions.assertTrue(GenericData.get().validate(schema, avroRecord));
    // Target schema: adds optional "featurex" inside preferences and replaces the
    // required "lat" with an optional "laty" inside the map's value record.
    Types.RecordType newRecord = Types.RecordType.get(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(
                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(5, true, "featurex", Types.BooleanType.get()),
                Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
        Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
            Types.RecordType.get(
                Types.Field.get(10, true, "laty", Types.FloatType.get()),
                Types.Field.get(11, false, "long", Types.FloatType.get())), false)
        )
    );
    Schema newAvroSchema = AvroInternalSchemaConverter.convert(newRecord, schema.getName());
    GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema);
    // Verify the rewrite produced a record valid under the new schema.
    Assertions.assertTrue(GenericData.get().validate(newAvroSchema, newAvroRecord));
  }

  /**
   * Evolving an old internal schema from a new Avro schema must keep the old ids for
   * fields that already existed (matched by name) and assign fresh ids, continuing
   * after the old schema's maximum, to newly added fields ("feature3", "add1",
   * "addStruct" and its children).
   */
  @Test
  public void testEvolutionSchemaFromNewAvroSchema() {
    Types.RecordType oldRecord = Types.RecordType.get(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(
                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(6, true, "featurex", Types.BooleanType.get()),
                Types.Field.get(7, true, "feature2", Types.BooleanType.get()))),
        Types.Field.get(3, false, "doubles", Types.ArrayType.get(8, false, Types.DoubleType.get())),
        Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(
                Types.Field.get(11, false, "laty", Types.FloatType.get()),
                Types.Field.get(12, false, "long", Types.FloatType.get())), false)
        )
    );
    InternalSchema oldSchema = new InternalSchema(oldRecord.fields());
    // Ids below are placeholders; refreshNewId reassigns them all before conversion.
    Types.RecordType evolvedRecord = Types.RecordType.get(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(
                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(5, true, "featurex", Types.BooleanType.get()),
                Types.Field.get(6, true, "feature2", Types.BooleanType.get()),
                Types.Field.get(5, true, "feature3", Types.BooleanType.get()))),
        Types.Field.get(3, false, "doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
        Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
            Types.RecordType.get(
                Types.Field.get(10, false, "laty", Types.FloatType.get()),
                Types.Field.get(11, false, "long", Types.FloatType.get())), false)
        ),
        Types.Field.get(0, false, "add1", Types.IntType.get()),
        Types.Field.get(2, true, "addStruct",
            Types.RecordType.get(
                Types.Field.get(5, false, "nest1", Types.BooleanType.get()),
                Types.Field.get(5, true, "nest2", Types.BooleanType.get())))
    );
    evolvedRecord = (Types.RecordType) InternalSchemaBuilder.getBuilder().refreshNewId(evolvedRecord, new AtomicInteger(0));
    Schema evolvedAvroSchema = AvroInternalSchemaConverter.convert(evolvedRecord, "test1");
    InternalSchema result = AvroSchemaEvolutionUtils.evolveSchemaFromNewAvroSchema(evolvedAvroSchema, oldSchema);
    // Existing fields keep their old ids; new fields get 13..17.
    Types.RecordType checkedRecord = Types.RecordType.get(
        Types.Field.get(0, false, "id", Types.IntType.get()),
        Types.Field.get(1, true, "data", Types.StringType.get()),
        Types.Field.get(2, true, "preferences",
            Types.RecordType.get(
                Types.Field.get(5, false, "feature1", Types.BooleanType.get()),
                Types.Field.get(6, true, "featurex", Types.BooleanType.get()),
                Types.Field.get(7, true, "feature2", Types.BooleanType.get()),
                Types.Field.get(17, true, "feature3", Types.BooleanType.get()))),
        Types.Field.get(3, false, "doubles", Types.ArrayType.get(8, false, Types.DoubleType.get())),
        Types.Field.get(4, false, "locations", Types.MapType.get(9, 10, Types.StringType.get(),
            Types.RecordType.get(
                Types.Field.get(11, false, "laty", Types.FloatType.get()),
                Types.Field.get(12, false, "long", Types.FloatType.get())), false)
        ),
        Types.Field.get(13, true, "add1", Types.IntType.get()),
        Types.Field.get(14, true, "addStruct",
            Types.RecordType.get(
                Types.Field.get(15, false, "nest1", Types.BooleanType.get()),
                Types.Field.get(16, true, "nest2", Types.BooleanType.get())))
    );
    Assertions.assertEquals(checkedRecord, result.getRecord());
  }
}

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.internal.schema.utils;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.InternalSchemaBuilder;
import org.apache.hudi.internal.schema.Types;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Assertions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
public class TestInternalSchemaUtils {
@Test
public void testPruneSchema() {
Types.RecordType record = getSimpleRecordType();
InternalSchema originSchema = new InternalSchema(record.fields());
List<Integer> prunedCols = new ArrayList<>();
prunedCols.add(4);
prunedCols.add(3);
prunedCols.add(0);
prunedCols.add(2);
InternalSchema prunedSchema = InternalSchemaUtils.pruneInternalSchemaByID(originSchema, prunedCols, null);
InternalSchema checkedSchema = new InternalSchema(Arrays.asList(new Types.Field[] {
Types.Field.get(0, "bool", Types.BooleanType.get()),
Types.Field.get(2, "long", Types.LongType.get()),
Types.Field.get(3, "float", Types.FloatType.get()),
Types.Field.get(4, "double", Types.DoubleType.get())
}));
Assertions.assertEquals(prunedSchema, checkedSchema);
// nest schema
Types.RecordType nestRecord = getNestRecordType();
InternalSchema originNestSchema = new InternalSchema(nestRecord.fields());
List<Integer> prunedNestCols = new ArrayList<>();
prunedNestCols.add(0);
prunedNestCols.add(1);
prunedNestCols.add(5);
prunedNestCols.add(11);
InternalSchema prunedNestSchema = InternalSchemaUtils.pruneInternalSchemaByID(originNestSchema, prunedNestCols, null);
}
@Test
public void testInternalSchemaVisitor() {
Types.RecordType nestRecord = getNestRecordType();
Map<String, Integer> result = InternalSchemaBuilder.getBuilder().buildNameToId(nestRecord);
Assertions.assertEquals(result.size(), 12);
Assertions.assertEquals(result.get("locations.value.long"), 11);
Assertions.assertEquals(result.get("locations.value.lat"), 10);
Assertions.assertEquals(result.get("locations.value"), 9);
Assertions.assertEquals(result.get("locations.key"), 8);
Assertions.assertEquals(result.get("doubles.element"), 7);
Types.RecordType simpleRecord = getSimpleRecordType();
Map<String, Integer> result1 = InternalSchemaBuilder.getBuilder().buildNameToId(simpleRecord);
Assertions.assertEquals(result1.size(), 5);
Assertions.assertEquals(result1.get("double"), 4);
}
public Types.RecordType getNestRecordType() {
return Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()),
Types.Field.get(1, true, "data", Types.StringType.get()),
Types.Field.get(2, true, "preferences",
Types.RecordType.get(Types.Field.get(5, false, "feature1",
Types.BooleanType.get()), Types.Field.get(6, true, "feature2", Types.BooleanType.get()))),
Types.Field.get(3, false,"doubles", Types.ArrayType.get(7, false, Types.DoubleType.get())),
Types.Field.get(4, false, "locations", Types.MapType.get(8, 9, Types.StringType.get(),
Types.RecordType.get(Types.Field.get(10, false, "lat", Types.FloatType.get()), Types.Field.get(11, false, "long", Types.FloatType.get())), false))
);
}
public Types.RecordType getSimpleRecordType() {
return Types.RecordType.get(Arrays.asList(new Types.Field[] {
Types.Field.get(0, "bool", Types.BooleanType.get()),
Types.Field.get(1, "int", Types.IntType.get()),
Types.Field.get(2, "long", Types.LongType.get()),
Types.Field.get(3, "float", Types.FloatType.get()),
Types.Field.get(4, "double", Types.DoubleType.get())
}));
}
}