Improving out of box experience for data source

- Fixes #246 - Bump up default parallelism to 1500, to handle large upserts - Add docs on S3 configuration & tuning tips with tested Spark knobs - Fix bug to not duplicate hoodie metadata fields when input dataframe is another hoodie dataset - Improve speed of ROTablePathFilter by removing directory check - Move to spark-avro 4.0 to handle issue with nested fields with same name - Keep AvroConversionUtils in sync with spark-avro 4.0
2018-01-05 14:06:18 -08:00
parent a97814462d
commit 85dd265b7b
8 changed files with 112 additions and 19 deletions
--- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java
+++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java
@@ -74,6 +74,13 @@ public class HoodieAvroUtils {
    return reader.read(null, decoder);
  }

+  public static boolean isMetadataField(String fieldName) {
+    return HoodieRecord.COMMIT_TIME_METADATA_FIELD.equals(fieldName)
+        || HoodieRecord.COMMIT_SEQNO_METADATA_FIELD.equals(fieldName)
+        || HoodieRecord.RECORD_KEY_METADATA_FIELD.equals(fieldName)
+        || HoodieRecord.PARTITION_PATH_METADATA_FIELD.equals(fieldName)
+        || HoodieRecord.FILENAME_METADATA_FIELD.equals(fieldName);
+  }

  /**
   * Adds the Hoodie metadata fields to the given schema
@@ -98,7 +105,9 @@ public class HoodieAvroUtils {
    parentFields.add(partitionPathField);
    parentFields.add(fileNameField);
    for (Schema.Field field : schema.getFields()) {
-      parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
+      if (!isMetadataField(field.name())) {
+        parentFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
+      }
    }

    Schema mergedSchema = Schema