Improving out of box experience for data source
- Fixes #246 - Bump up default parallelism to 1500, to handle large upserts - Add docs on s3 configuration & tuning tips with tested spark knobs - Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset - Improve speed of ROTablePathFilter by removing directory check - Move to spark-avro 4.0 to handle issue with nested fields with same name - Keep AvroConversionUtils in sync with spark-avro 4.0
This commit is contained in:
committed by
vinoth chandar
parent
a97814462d
commit
85dd265b7b
@@ -74,6 +74,13 @@ public class HoodieAvroUtils {
|
||||
return reader.read(null, decoder);
|
||||
}
|
||||
|
||||
/**
 * Checks whether the given field name is one of the Hoodie metadata fields, i.e. whether it
 * equals any of the five HoodieRecord metadata field constants compared below (commit time,
 * commit sequence number, record key, partition path, file name).
 *
 * @param fieldName name of the field to test
 * @return true if the name equals any of the metadata field constants, false otherwise
 */
public static boolean isMetadataField(String fieldName) {
|
||||
return HoodieRecord.COMMIT_TIME_METADATA_FIELD.equals(fieldName)
|
||||
|| HoodieRecord.COMMIT_SEQNO_METADATA_FIELD.equals(fieldName)
|
||||
|| HoodieRecord.RECORD_KEY_METADATA_FIELD.equals(fieldName)
|
||||
|| HoodieRecord.PARTITION_PATH_METADATA_FIELD.equals(fieldName)
|
||||
|| HoodieRecord.FILENAME_METADATA_FIELD.equals(fieldName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the Hoodie metadata fields to the given schema
|
||||
@@ -98,7 +105,9 @@ public class HoodieAvroUtils {
|
||||
parentFields.add(partitionPathField);
|
||||
parentFields.add(fileNameField);
|
||||
for (Schema.Field field : schema.getFields()) {
|
||||
parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
|
||||
if (!isMetadataField(field.name())) {
|
||||
parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
|
||||
}
|
||||
}
|
||||
|
||||
Schema mergedSchema = Schema
|
||||
|
||||
Reference in New Issue
Block a user