[HUDI-3130] Fixing Hive getSchema for RT tables addressing different partitions having different schemas (#4468)

* Fixing Hive getSchema for RT tables

* Addressing feedback

* temp diff

* fixing tests after spark datasource read support for metadata table is merged to master

* Adding multi-partition schema evolution tests to HoodieRealTimeRecordReader

Co-authored-by: Aditya Tiwari <aditya.tiwari@flipkart.com>
Co-authored-by: sivabalan <n.siva.b@gmail.com>
Aditya Tiwari
2022-03-06 07:51:35 +05:30
committed by GitHub
parent 6a46130037
commit 051ad0b033
9 changed files with 174 additions and 56 deletions


@@ -21,21 +21,18 @@ package org.apache.hudi.hadoop.realtime;
 import org.apache.hudi.common.model.HoodieAvroPayload;
 import org.apache.hudi.common.model.HoodiePayloadProps;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
-import org.apache.hudi.common.table.log.LogReaderUtils;
-import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.hadoop.InputSplitUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;
 
 import org.apache.avro.Schema;
 import org.apache.avro.Schema.Field;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
-import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -57,6 +54,7 @@ public abstract class AbstractRealtimeRecordReader {
   private Schema readerSchema;
   private Schema writerSchema;
   private Schema hiveSchema;
+  private HoodieTableMetaClient metaClient;
 
   public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) {
     this.split = split;
@@ -65,15 +63,15 @@ public abstract class AbstractRealtimeRecordReader {
     LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
     LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""));
     try {
-      HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build();
+      metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build();
       if (metaClient.getTableConfig().getPreCombineField() != null) {
         this.payloadProps.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, metaClient.getTableConfig().getPreCombineField());
       }
       this.usesCustomPayload = usesCustomPayload(metaClient);
       LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
       init();
-    } catch (IOException e) {
-      throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
+    } catch (Exception e) {
+      throw new HoodieException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
     }
   }
 
@@ -83,19 +81,14 @@
   }
 
   /**
-   * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls
+   * Gets schema from HoodieTableMetaClient. If not, falls
    * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into the
    * job conf.
    */
-  private void init() throws IOException {
-    Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogFiles(), jobConf);
-    if (schemaFromLogFile == null) {
-      writerSchema = InputSplitUtils.getBaseFileSchema((FileSplit)split, jobConf);
-      LOG.info("Writer Schema From Parquet => " + writerSchema.getFields());
-    } else {
-      writerSchema = schemaFromLogFile;
-      LOG.info("Writer Schema From Log => " + writerSchema.toString(true));
-    }
+  private void init() throws Exception {
+    LOG.info("Getting writer schema from table avro schema ");
+    writerSchema = new TableSchemaResolver(metaClient).getTableAvroSchema();
     // Add partitioning fields to writer schema for resulting row to contain null values for these fields
     String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
     List<String> partitioningFields =