@@ -27,6 +27,7 @@ import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.testutils.RawTripTestPayload;
 import org.apache.hudi.common.util.ParquetUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.HoodieCreateHandle;
 import org.apache.hudi.io.HoodieMergeHandle;
 import org.apache.hudi.table.HoodieSparkTable;
@@ -34,12 +35,14 @@ import org.apache.hudi.testutils.HoodieClientTestHarness;
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.io.InvalidRecordException;
 import org.apache.parquet.io.ParquetDecodingException;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.function.Executable;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -51,7 +54,7 @@ import java.util.stream.Collectors;
 
 import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
@@ -68,77 +71,158 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
     cleanupResources();
   }
 
-  @Test
-  public void testSchemaEvolutionOnUpdate() throws Exception {
-    // Create a bunch of records with a old version of schema
-    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
+  private WriteStatus prepareFirstRecordCommit(List<String> recordsStrs) throws IOException {
+    // Create a bunch of records with an old version of schema
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.avsc");
     final HoodieSparkTable table = HoodieSparkTable.create(config, context);
-
     final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
-      String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
-      String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
-      String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
       List<HoodieRecord> insertRecords = new ArrayList<>();
-      RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
-      RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
-      RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
-
+      for (String recordStr : recordsStrs) {
+        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
+        insertRecords
+            .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
+      }
       Map<String, HoodieRecord> insertRecordMap = insertRecords.stream()
           .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
       HoodieCreateHandle createHandle =
-          new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), "f1-0", insertRecordMap, supplier);
+          new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
      createHandle.write();
       return createHandle.close();
     }).collect();
 
     final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
     FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
+    return statuses.get(0);
+  }
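The helper above resolves its write schema from the /exampleSchema.avsc test resource. As a reading aid, here is a minimal sketch of a schema consistent with the record strings in this test; the record name is invented and only the three fields (_row_key, time, number) are actually implied by the JSON above, so treat it as an assumption rather than the contents of the real resource file:

    import org.apache.avro.Schema;

    public class ExampleSchemaSketch {
      // Inferred shape of /exampleSchema.avsc; "triprec" is a placeholder name.
      static final String EXAMPLE_SCHEMA = "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
          + "{\"name\":\"_row_key\",\"type\":\"string\"},"
          + "{\"name\":\"time\",\"type\":\"string\"},"
          + "{\"name\":\"number\",\"type\":\"int\"}]}";

      public static void main(String[] args) {
        Schema schema = new Schema.Parser().parse(EXAMPLE_SCHEMA);
        System.out.println(schema.getFields()); // _row_key, time, number
      }
    }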
 
-    // Now try an update with an evolved schema
-    // Evolved schema does not have guarantee on preserving the original field ordering
-    final HoodieWriteConfig config2 = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
-    final WriteStatus insertResult = statuses.get(0);
-    String fileId = insertResult.getFileId();
+  private List<String> generateMultipleRecordsForExampleSchema() {
+    List<String> recordsStrs = new ArrayList<>();
+    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
+    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
+    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
+    recordsStrs.add(recordStr1);
+    recordsStrs.add(recordStr2);
+    recordsStrs.add(recordStr3);
+    return recordsStrs;
+  }
 
-    final HoodieSparkTable table2 = HoodieSparkTable.create(config, context);
-    assertEquals(1, jsc.parallelize(Arrays.asList(1)).map(x -> {
-      // New content with values for the newly added field
-      String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
-      List<HoodieRecord> updateRecords = new ArrayList<>();
-      RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
-      HoodieRecord record1 =
-          new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
-      record1.unseal();
-      record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
-      record1.seal();
-      updateRecords.add(record1);
+  private List<String> generateOneRecordForExampleSchema() {
+    List<String> recordsStrs = new ArrayList<>();
+    String recordStr = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
+    recordsStrs.add(recordStr);
+    return recordsStrs;
+  }
 
-      assertDoesNotThrow(() -> {
-        HoodieMergeHandle mergeHandle = new HoodieMergeHandle(config2, "101", table2,
-            updateRecords.iterator(), record1.getPartitionPath(), fileId, supplier);
-        Configuration conf = new Configuration();
-        AvroReadSupport.setAvroReadSchema(conf, mergeHandle.getWriterSchemaWithMetafields());
-        List<GenericRecord> oldRecords = ParquetUtils.readAvroRecords(conf,
-            new Path(config2.getBasePath() + "/" + insertResult.getStat().getPath()));
+  private void assertSchemaEvolutionOnUpdateResult(WriteStatus insertResult, HoodieSparkTable updateTable,
+      List<HoodieRecord> updateRecords, String assertMsg, boolean isAssertThrow, Class expectedExceptionType) {
+    jsc.parallelize(Arrays.asList(1)).map(x -> {
+      Executable executable = () -> {
+        HoodieMergeHandle mergeHandle = new HoodieMergeHandle(updateTable.getConfig(), "101", updateTable,
+            updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier);
+        AvroReadSupport.setAvroReadSchema(updateTable.getHadoopConf(), mergeHandle.getWriterSchemaWithMetafields());
+        List<GenericRecord> oldRecords = ParquetUtils.readAvroRecords(updateTable.getHadoopConf(),
+            new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath()));
         for (GenericRecord rec : oldRecords) {
           mergeHandle.write(rec);
         }
         mergeHandle.close();
-      }, "UpdateFunction could not read records written with exampleSchema.txt using the "
-          + "exampleEvolvedSchema.txt");
-
+      };
+      if (isAssertThrow) {
+        assertThrows(expectedExceptionType, executable, assertMsg);
+      } else {
+        assertDoesNotThrow(executable, assertMsg);
+      }
       return 1;
-    }).collect().size());
+    }).collect();
   }
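The isAssertThrow flag above encodes which evolutions are readable: at the Avro level, a reader schema may add a field only if the field has a default, while the parquet-avro layer used here surfaces its own errors for dropped columns (InvalidRecordException) and changed types (ParquetDecodingException). A minimal plain-Avro sketch of the default-value rule, independent of Hudi; the record and field names are illustrative only:

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaCompatibility;

    public class EvolutionRulesSketch {
      public static void main(String[] args) {
        Schema writer = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"r\",\"fields\":[{\"name\":\"number\",\"type\":\"int\"}]}");
        Schema readerWithDefault = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"r\",\"fields\":[{\"name\":\"number\",\"type\":\"int\"},"
                + "{\"name\":\"added_field\",\"type\":\"int\",\"default\":1}]}");
        Schema readerNoDefault = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"r\",\"fields\":[{\"name\":\"number\",\"type\":\"int\"},"
                + "{\"name\":\"added_field\",\"type\":\"int\"}]}");
        // Added column WITH a default keeps old rows readable -> COMPATIBLE,
        // like testSchemaEvolutionOnUpdateSuccessWithAddColumnHaveDefault.
        System.out.println(SchemaCompatibility
            .checkReaderWriterCompatibility(readerWithDefault, writer).getType());
        // Added REQUIRED column (no default) cannot be resolved -> INCOMPATIBLE,
        // like testSchemaEvolutionOnUpdateMisMatchWithAddColumnNotHaveDefault.
        System.out.println(SchemaCompatibility
            .checkReaderWriterCompatibility(readerNoDefault, writer).getType());
      }
    }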
 
+  private List<HoodieRecord> buildUpdateRecords(String recordStr, String insertFileId) throws IOException {
+    List<HoodieRecord> updateRecords = new ArrayList<>();
+    RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
+    HoodieRecord record =
+        new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange);
+    record.setCurrentLocation(new HoodieRecordLocation("101", insertFileId));
+    record.seal();
+    updateRecords.add(record);
+    return updateRecords;
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateSuccessWithAddColumnHaveDefault() throws Exception {
+    final WriteStatus insertResult = prepareFirstRecordCommit(generateMultipleRecordsForExampleSchema());
+    // Now try an update with an evolved schema
+    // Evolved schema does not have guarantee on preserving the original field ordering
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchema.avsc");
+    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
+    // New content with values for the newly added field
+    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
+    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
+    String assertMsg = "UpdateFunction could not read records written with exampleSchema.avsc using the "
+        + "exampleEvolvedSchema.avsc";
+    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, false, null);
+  }
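For readers without the resource files at hand: the success case above only requires that the evolved schema's new column carry a default. A hypothetical evolved counterpart to the earlier schema sketch, inferred from the record JSON and the test name rather than copied from /exampleEvolvedSchema.avsc:

    import org.apache.avro.Schema;

    public class EvolvedSchemaSketch {
      // Assumed shape: the three base fields plus "added_field" with a default,
      // which is what lets rows written without the column still be read.
      static final String EVOLVED_SCHEMA = "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
          + "{\"name\":\"_row_key\",\"type\":\"string\"},"
          + "{\"name\":\"time\",\"type\":\"string\"},"
          + "{\"name\":\"number\",\"type\":\"int\"},"
          + "{\"name\":\"added_field\",\"type\":\"int\",\"default\":1}]}";

      public static void main(String[] args) {
        Schema schema = new Schema.Parser().parse(EVOLVED_SCHEMA);
        System.out.println(schema.getField("added_field").defaultVal()); // 1
      }
    }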
+
+  @Test
+  public void testSchemaEvolutionOnUpdateSuccessWithChangeColumnOrder() throws Exception {
+    final WriteStatus insertResult = prepareFirstRecordCommit(generateMultipleRecordsForExampleSchema());
+    // Now try an update with an evolved schema
+    // Evolved schema does not have guarantee on preserving the original field ordering
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchemaChangeOrder.avsc");
+    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
+    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"added_field\":1,\"number\":12}";
+    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
+    String assertMsg = "UpdateFunction could not read records written with exampleSchema.avsc using the "
+        + "exampleEvolvedSchemaChangeOrder.avsc as column order change";
+    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, false, null);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateMisMatchWithDeleteColumn() throws Exception {
+    final WriteStatus insertResult = prepareFirstRecordCommit(generateOneRecordForExampleSchema());
+    // Now try an update with an evolved schema
+    // Evolved schema does not have guarantee on preserving the original field ordering
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchemaDeleteColumn.avsc");
+    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
+    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\"}";
+    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
+    String assertMsg = "UpdateFunction when delete column, Parquet/Avro schema mismatch: Avro field 'xxx' not found";
+    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, true, InvalidRecordException.class);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateMisMatchWithAddColumnNotHaveDefault() throws Exception {
+    final WriteStatus insertResult = prepareFirstRecordCommit(generateOneRecordForExampleSchema());
+    // Now try an update with an evolved schema
+    // Evolved schema does not have guarantee on preserving the original field ordering
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchemaColumnRequire.avsc");
+    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
+    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
+    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
+    String assertMsg = "UpdateFunction could not read records written with exampleSchema.avsc using the "
+        + "exampleEvolvedSchemaColumnRequire.avsc, because old records do not have required column added_field";
+    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, true, HoodieUpsertException.class);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateMisMatchWithChangeColumnType() throws Exception {
+    final WriteStatus insertResult = prepareFirstRecordCommit(generateOneRecordForExampleSchema());
+    // Now try an update with an evolved schema
+    // Evolved schema does not have guarantee on preserving the original field ordering
+    final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchemaColumnType.avsc");
+    final HoodieSparkTable table = HoodieSparkTable.create(config, context);
+    String recordStr = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":\"12\"}";
+    List<HoodieRecord> updateRecords = buildUpdateRecords(recordStr, insertResult.getFileId());
+    String assertMsg = "UpdateFunction when change column type, org.apache.parquet.avro.AvroConverters$FieldUTF8Converter";
+    assertSchemaEvolutionOnUpdateResult(insertResult, table, updateRecords, assertMsg, true, ParquetDecodingException.class);
+  }
 
   private HoodieWriteConfig makeHoodieClientConfig(String name) {
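All five tests go through the same read path: the merge handle installs its (possibly evolved) writer schema as the Avro read schema before re-reading the old Parquet file, and schema resolution happens during that read. A minimal standalone sketch of the mechanism, assuming only parquet-avro; the class, method, and arguments are placeholders, not Hudi APIs:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.avro.AvroParquetReader;
    import org.apache.parquet.avro.AvroReadSupport;
    import org.apache.parquet.hadoop.ParquetReader;

    public class EvolvedReadSketch {
      // Re-reads an existing Parquet file through an evolved Avro schema,
      // mirroring what HoodieMergeHandle does with getWriterSchemaWithMetafields().
      static List<GenericRecord> readWithEvolvedSchema(Path parquetFile, Schema evolvedSchema)
          throws IOException {
        Configuration conf = new Configuration();
        // Parquet-avro resolves each record from the file's writer schema
        // into this read schema; incompatible evolutions fail here.
        AvroReadSupport.setAvroReadSchema(conf, evolvedSchema);
        List<GenericRecord> records = new ArrayList<>();
        try (ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(parquetFile).withConf(conf).build()) {
          GenericRecord rec;
          while ((rec = reader.read()) != null) {
            records.add(rec); // each record now conforms to the evolved schema
          }
        }
        return records;
      }
    }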