[HUDI-3130] Fixing Hive getSchema for RT tables addressing different partitions having different schemas (#4468)

* Fixing Hive getSchema for RT tables

* Addressing feedback

* temp diff

* fixing tests after spark datasource read support for metadata table is merged to master

* Adding multi-partition schema evolution tests to HoodieRealTimeRecordReader

Co-authored-by: Aditya Tiwari <aditya.tiwari@flipkart.com>
Co-authored-by: sivabalan <n.siva.b@gmail.com>
Aditya Tiwari
2022-03-06 07:51:35 +05:30
committed by GitHub
parent 6a46130037
commit 051ad0b033
9 changed files with 174 additions and 56 deletions


@@ -21,21 +21,18 @@ package org.apache.hudi.hadoop.realtime;
 import org.apache.hudi.common.model.HoodieAvroPayload;
 import org.apache.hudi.common.model.HoodiePayloadProps;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
-import org.apache.hudi.common.table.log.LogReaderUtils;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.hadoop.InputSplitUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;
 
 import org.apache.avro.Schema;
 import org.apache.avro.Schema.Field;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
-import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -57,6 +54,7 @@ public abstract class AbstractRealtimeRecordReader {
   private Schema readerSchema;
   private Schema writerSchema;
   private Schema hiveSchema;
+  private HoodieTableMetaClient metaClient;
 
   public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) {
     this.split = split;
@@ -65,15 +63,15 @@ public abstract class AbstractRealtimeRecordReader {
     LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
     LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""));
     try {
-      HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build();
+      metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build();
       if (metaClient.getTableConfig().getPreCombineField() != null) {
         this.payloadProps.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, metaClient.getTableConfig().getPreCombineField());
       }
       this.usesCustomPayload = usesCustomPayload(metaClient);
       LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
       init();
-    } catch (IOException e) {
-      throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
+    } catch (Exception e) {
+      throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
     }
   }
 
@@ -83,19 +81,14 @@
   }
 
   /**
-   * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls
+   * Gets schema from HoodieTableMetaClient. If not, falls
    * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into the
    * job conf.
    */
-  private void init() throws IOException {
-    Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogFiles(), jobConf);
-    if (schemaFromLogFile == null) {
-      writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
-      LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());
-    } else {
-      writerSchema = schemaFromLogFile;
-      LOG.info("Writer Schema From Log => " + writerSchema.toString(true));
-    }
+  private void init() throws Exception {
+    LOG.info("Getting writer schema from table avro schema ");
+    writerSchema = new TableSchemaResolver(metaClient).getTableAvroSchema();
     // Add partitioning fields to writer schema for resulting row to contain null values for these fields
     String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
     List<String> partitioningFields =