1
0

[HUDI-4081][HUDI-4472] Addressing Spark SQL vs Spark DS performance gap (#6213)

This commit is contained in:
Alexey Kudinkin
2022-07-28 15:36:03 -07:00
committed by GitHub
parent 70b5cf6dab
commit cfd0c1ee34
14 changed files with 382 additions and 189 deletions

View File

@@ -316,6 +316,9 @@ public class TableSchemaResolver {
* @param oldSchema Older schema to check.
* @param newSchema Newer schema to check.
* @return True if the schema validation is successful
*
* TODO revisit this method: it's implemented incorrectly, as it might apply different criteria
* to a top-level record and to a nested record (for example, if that nested record is contained within an array)
*/
public static boolean isSchemaCompatible(Schema oldSchema, Schema newSchema) {
if (oldSchema.getType() == newSchema.getType() && newSchema.getType() == Schema.Type.RECORD) {
@@ -366,13 +369,31 @@ public class TableSchemaResolver {
return isSchemaCompatible(new Schema.Parser().parse(oldSchema), new Schema.Parser().parse(newSchema));
}
/**
 * Resolves the table's latest Avro {@link Schema}, but only when the table is non-empty
 * (i.e., there is at least a single commit on the timeline).
 *
 * <p>Unlike {@link #getTableAvroSchema(boolean)}, this method does not fall back to the schema
 * the table was created with: for an empty timeline an empty {@link Option} is returned instead.
 *
 * @param includeMetadataFields forwarded as-is to the internal schema resolution
 *                              (presumably controls whether metadata fields are part of the
 *                              returned schema; confirm against the internal method)
 * @return the resolved schema wrapped in an {@link Option}, or an empty {@link Option} when
 *         the timeline is empty
 * @throws Exception propagated from the underlying calls
 */
public Option<Schema> getTableAvroSchemaFromLatestCommit(boolean includeMetadataFields) throws Exception {
  // Guard: with an empty timeline there is no commit to resolve the schema from
  if (!metaClient.isTimelineNonEmpty()) {
    return Option.empty();
  }

  Schema latestCommitSchema = getTableAvroSchemaInternal(includeMetadataFields, Option.empty());
  return Option.of(latestCommitSchema);
}
/**
* Get latest schema either from incoming schema or table schema.
* @param writeSchema incoming batch's write schema.
* @param convertTableSchemaToAddNamespace {@code true} if table schema needs to be converted. {@code false} otherwise.
* @param converterFn converter function to be applied to the table schema (for example, to add a namespace). Each caller can decide whether any conversion is required.
* @return the latest schema.
*
* @deprecated will be removed (HUDI-4472)
*/
@Deprecated
public Schema getLatestSchema(Schema writeSchema, boolean convertTableSchemaToAddNamespace,
Function1<Schema, Schema> converterFn) {
Schema latestSchema = writeSchema;