1
0

[HUDI-3213] Making commit preserve metadata to true for compaction (#4811)

* Making commit preserve metadata to true

* Fixing integ tests

* Fixing preserve commit metadata for metadata table

* Fixed bootstrap tests

* Temp diff

* Fixing merge handle

* Renaming fallback record

* Fixing build issue

* Fixing test failures
This commit is contained in:
Sivabalan Narayanan
2022-03-07 07:32:05 -05:00
committed by GitHub
parent 6f57bbfac4
commit 3539578ccb
15 changed files with 88 additions and 48 deletions

View File

@@ -176,7 +176,7 @@ public class HoodieAvroUtils {
/**
* Adds the Hoodie metadata fields to the given schema.
*
* @param schema The schema
* @param schema The schema
* @param withOperationField Whether to include the '_hoodie_operation' field
*/
public static Schema addMetadataFields(Schema schema, boolean withOperationField) {
@@ -276,7 +276,7 @@ public class HoodieAvroUtils {
List<Schema.Field> toBeAddedFields = new ArrayList<>();
Schema recordSchema = Schema.createRecord("HoodieRecordKey", "", "", false);
for (Schema.Field schemaField: fileSchema.getFields()) {
for (Schema.Field schemaField : fileSchema.getFields()) {
if (fields.contains(schemaField.name())) {
toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultVal()));
}
@@ -303,7 +303,7 @@ public class HoodieAvroUtils {
* engines have varying constraints regarding treating the case-sensitivity of fields, its best to let caller
* determine that.
*
* @param schema Passed in schema
* @param schema Passed in schema
* @param newFieldNames Null Field names to be added
*/
public static Schema appendNullSchemaFields(Schema schema, List<String> newFieldNames) {
@@ -382,10 +382,34 @@ public class HoodieAvroUtils {
return newRecord;
}
/**
 * Rewrites the given {@code genericRecord} into a new {@link GenericRecord} that conforms to
 * {@code newSchema}, copying field values over by name.
 *
 * <p>Non-metadata fields are always copied from the primary record (or set to their schema
 * defaults). Hoodie metadata fields are only populated when {@code copyOverMetaFields} is true,
 * preferring the primary record's value and falling back to {@code fallbackRecord} when the
 * primary record's schema lacks the field.
 *
 * @param genericRecord      source record whose field values are copied over
 * @param newSchema          target schema the rewritten record must conform to
 * @param copyOverMetaFields whether Hoodie metadata fields should be populated in the result
 * @param fallbackRecord     secondary record consulted for metadata fields missing from the
 *                           primary record; may be null — TODO confirm null is a valid caller input
 * @return the rewritten record, validated against {@code newSchema}
 * @throws SchemaCompatibilityException if the rewritten record does not validate against
 *                                      {@code newSchema}
 */
public static GenericRecord rewriteRecord(GenericRecord genericRecord, Schema newSchema, boolean copyOverMetaFields, GenericRecord fallbackRecord) {
GenericRecord newRecord = new GenericData.Record(newSchema);
// NOTE(review): specific records appear to be excluded from the generic metadata-field copy
// below — presumably their metadata fields are managed elsewhere; confirm with callers.
boolean isSpecificRecord = genericRecord instanceof SpecificRecordBase;
for (Schema.Field f : newSchema.getFields()) {
// Copy every field except metadata fields of a specific record; metadata fields of a plain
// generic record pass this guard and may be copied again by the branch below.
if (!(isSpecificRecord && isMetadataField(f.name()))) {
copyOldValueOrSetDefault(genericRecord, newRecord, f);
}
if (isMetadataField(f.name()) && copyOverMetaFields) {
// if meta field exists in primary generic record, copy over.
if (genericRecord.getSchema().getField(f.name()) != null) {
copyOldValueOrSetDefault(genericRecord, newRecord, f);
} else if (fallbackRecord != null && fallbackRecord.getSchema().getField(f.name()) != null) {
// if not, try to copy from the fallback record.
copyOldValueOrSetDefault(fallbackRecord, newRecord, f);
}
}
}
// Fail fast if the copied/defaulted values do not satisfy the target schema.
if (!GenericData.get().validate(newSchema, newRecord)) {
throw new SchemaCompatibilityException(
"Unable to validate the rewritten record " + genericRecord + " against schema " + newSchema);
}
return newRecord;
}
/**
* Converts list of {@link GenericRecord} provided into the {@link GenericRecord} adhering to the
* provided {@code newSchema}.
*
* <p>
* To better understand conversion rules please check {@link #rewriteRecord(GenericRecord, Schema)}
*/
public static List<GenericRecord> rewriteRecords(List<GenericRecord> records, Schema newSchema) {
@@ -491,9 +515,8 @@ public class HoodieAvroUtils {
* Returns the string value of the given record {@code rec} and field {@code fieldName}.
* The field and value both could be missing.
*
* @param rec The record
* @param rec The record
* @param fieldName The field name
*
* @return the string form of the field
* or empty if the schema does not contain the field name or the value is null
*/
@@ -507,7 +530,7 @@ public class HoodieAvroUtils {
* This method converts values for fields with certain Avro/Parquet data types that require special handling.
*
* @param fieldSchema avro field schema
* @param fieldValue avro field value
* @param fieldValue avro field value
* @return field value either converted (for certain data types) or as it is.
*/
public static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) {
@@ -527,15 +550,15 @@ public class HoodieAvroUtils {
/**
* This method converts values for fields with certain Avro Logical data types that require special handling.
*
* <p>
* Logical Date Type is converted to actual Date value instead of Epoch Integer which is how it is
* represented/stored in parquet.
*
* <p>
* Decimal Data Type is converted to actual decimal value instead of bytes/fixed which is how it is
* represented/stored in parquet.
*
* @param fieldSchema avro field schema
* @param fieldValue avro field value
* @param fieldValue avro field value
* @return field value either converted (for certain data types) or as it is.
*/
private static Object convertValueForAvroLogicalTypes(Schema fieldSchema, Object fieldValue, boolean consistentLogicalTimestampEnabled) {
@@ -569,6 +592,7 @@ public class HoodieAvroUtils {
/**
* Sanitizes Name according to Avro rule for names.
* Removes characters other than the ones mentioned in https://avro.apache.org/docs/current/spec.html#names .
*
* @param name input name
* @return sanitized name
*/

View File

@@ -42,6 +42,8 @@ public abstract class HoodieRecord<T> implements Serializable {
public static final String OPERATION_METADATA_FIELD = "_hoodie_operation";
public static final String HOODIE_IS_DELETED = "_hoodie_is_deleted";
public static int FILENAME_METADATA_FIELD_POS = 4;
public static final List<String> HOODIE_META_COLUMNS =
CollectionUtils.createImmutableList(COMMIT_TIME_METADATA_FIELD, COMMIT_SEQNO_METADATA_FIELD,
RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD, FILENAME_METADATA_FIELD);

View File

@@ -172,9 +172,9 @@ public class HoodieTableConfig extends HoodieConfig {
.noDefaultValue()
.withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table");
public static final ConfigProperty<String> POPULATE_META_FIELDS = ConfigProperty
public static final ConfigProperty<Boolean> POPULATE_META_FIELDS = ConfigProperty
.key("hoodie.populate.meta.fields")
.defaultValue("true")
.defaultValue(true)
.withDocumentation("When enabled, populates all meta fields. When disabled, no meta fields are populated "
+ "and incremental queries will not be functional. This is only meant to be used for append only/immutable data for batch processing");