[HUDI-3398] Fix TableSchemaResolver for all file formats and metadata table (#4782)

Co-authored-by: yuezhang <yuezhang@freewheel.tv>
2022-02-15 08:02:47 +08:00
parent e639d99387
commit 0a97a9893a
3 changed files with 299 additions and 16 deletions
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java
@@ -21,8 +21,10 @@ package org.apache.hudi.common.table;
 import org.apache.avro.Schema;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.SchemaCompatibility;
+import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.io.hfile.CacheConfig;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieFileFormat;
@@ -41,6 +43,9 @@ import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.InvalidTableException;
+import org.apache.hudi.io.storage.HoodieHFileReader;
+
+import org.apache.hudi.io.storage.HoodieOrcReader;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.parquet.avro.AvroSchemaConverter;
@@ -78,29 +83,26 @@ public class TableSchemaResolver {
    try {
      switch (metaClient.getTableType()) {
        case COPY_ON_WRITE:
-          // For COW table, the file has data written must be in parquet format currently.
+          // For a COW table, the data file written must currently be in parquet or orc format.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
-            return readSchemaFromBaseFile(new Path(filePath));
+            return readSchemaFromBaseFile(filePath);
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
                + "so could not get schema for table " + metaClient.getBasePath());
          }
        case MERGE_ON_READ:
-          // For MOR table, the file has data written may be a parquet file or .log file.
+          // For a MOR table, the data file written may be a parquet file, .log file, orc file or hfile.
          // Determine the file format based on the file name, and then extract schema from it.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
-            if (filePath.contains(HoodieLogFile.DELTA_EXTENSION)) {
+            if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
              // this is a log file
              return readSchemaFromLogFile(new Path(filePath));
-            } else if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
-              // this is a parquet file
-              return readSchemaFromBaseFile(new Path(filePath));
            } else {
-              throw new IllegalArgumentException("Unknown file format :" + filePath);
+              return readSchemaFromBaseFile(filePath);
            }
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
@@ -115,6 +117,21 @@ public class TableSchemaResolver {
    }
  }

+  /** Reads the schema of a base file, choosing the reader from the file extension. */
+  private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
+    // Match the extension as a suffix: a substring match would pick the wrong reader when a
+    // directory in the path merely contains the extension of another base file format.
+    if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
+      return readSchemaFromParquetBaseFile(new Path(filePath));
+    } else if (filePath.endsWith(HoodieFileFormat.HFILE.getFileExtension())) {
+      return readSchemaFromHFileBaseFile(new Path(filePath));
+    } else if (filePath.endsWith(HoodieFileFormat.ORC.getFileExtension())) {
+      return readSchemaFromORCBaseFile(new Path(filePath));
+    } else {
+      throw new IllegalArgumentException("Unknown base file format :" + filePath);
+    }
+  }
+
  public Schema getTableAvroSchemaFromDataFile() {
    return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile());
  }
@@ -409,19 +426,41 @@ public class TableSchemaResolver {
  /**
   * Read the parquet schema from a parquet File.
   */
-  public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException {
+  public MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);

    FileSystem fs = metaClient.getRawFs();
-    if (!fs.exists(parquetFilePath)) {
-      throw new IllegalArgumentException(
-          "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
-    }
    ParquetMetadata fileFooter =
        ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
    return fileFooter.getFileMetaData().getSchema();
  }

+  /** Reads the Avro schema of an HFile base file via HoodieHFileReader and converts it to a parquet schema. */
+  public MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
+    LOG.info("Reading schema from " + hFilePath);
+    FileSystem fs = metaClient.getRawFs();
+    CacheConfig cacheConfig = new CacheConfig(fs.getConf());
+    HoodieHFileReader<IndexedRecord> hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig);
+    try {
+      return convertAvroSchemaToParquet(hFileReader.getSchema());
+    } finally {
+      hFileReader.close(); // release the HFile reader even if the schema conversion fails
+    }
+  }
+
+
+  /**
+   * Reads the Avro schema of an ORC base file and returns it converted to a parquet schema.
+   */
+  public MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException {
+    LOG.info("Reading schema from " + orcFilePath);
+
+    HoodieOrcReader<IndexedRecord> reader =
+        new HoodieOrcReader<>(metaClient.getRawFs().getConf(), orcFilePath);
+    Schema avroSchema = reader.getSchema();
+    return convertAvroSchemaToParquet(avroSchema);
+  }
+
  /**
   * Read schema from a data file from the last compaction commit done.
   * @throws Exception
@@ -438,7 +477,7 @@ public class TableSchemaResolver {
    String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
        .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
            + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
-    return readSchemaFromBaseFile(new Path(filePath));
+    return readSchemaFromBaseFile(filePath);
  }

  /**
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java
@@ -19,6 +19,7 @@
 package org.apache.hudi.common.util;

 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -222,8 +223,15 @@ public class OrcUtils extends BaseFileUtils {
  public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
    try {
      Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
-      TypeDescription orcSchema = reader.getSchema();
-      return AvroOrcUtils.createAvroSchema(orcSchema);
+      if (reader.hasMetadataValue("orc.avro.schema")) {
+        ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema");
+        byte[] bytes = new byte[metadataValue.remaining()];
+        metadataValue.get(bytes);
+        return new Schema.Parser().parse(new String(bytes));
+      } else {
+        TypeDescription orcSchema = reader.getSchema();
+        return AvroOrcUtils.createAvroSchema(orcSchema);
+      }
    } catch (IOException io) {
      throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
    }