[HUDI-3398] Fix TableSchemaResolver for all file formats and metadata table (#4782)
Co-authored-by: yuezhang <yuezhang@freewheel.tv>
This commit is contained in:
@@ -21,8 +21,10 @@ package org.apache.hudi.common.table;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.avro.SchemaCompatibility;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
@@ -41,6 +43,9 @@ import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.InvalidTableException;
|
||||
import org.apache.hudi.io.storage.HoodieHFileReader;
|
||||
|
||||
import org.apache.hudi.io.storage.HoodieOrcReader;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
@@ -78,29 +83,26 @@ public class TableSchemaResolver {
|
||||
try {
|
||||
switch (metaClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
// For COW table, the file has data written must be in parquet format currently.
|
||||
// For a COW table, a written data file must currently be in parquet or orc format.
|
||||
if (instantAndCommitMetadata.isPresent()) {
|
||||
HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
|
||||
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
|
||||
return readSchemaFromBaseFile(new Path(filePath));
|
||||
return readSchemaFromBaseFile(filePath);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Could not find any data file written for commit, "
|
||||
+ "so could not get schema for table " + metaClient.getBasePath());
|
||||
}
|
||||
case MERGE_ON_READ:
|
||||
// For MOR table, the file has data written may be a parquet file or .log file.
|
||||
// For a MOR table, a written data file may be a parquet file, a .log file, an orc file or an hfile.
|
||||
// Determine the file format based on the file name, and then extract schema from it.
|
||||
if (instantAndCommitMetadata.isPresent()) {
|
||||
HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
|
||||
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
|
||||
if (filePath.contains(HoodieLogFile.DELTA_EXTENSION)) {
|
||||
if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
|
||||
// this is a log file
|
||||
return readSchemaFromLogFile(new Path(filePath));
|
||||
} else if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
|
||||
// this is a parquet file
|
||||
return readSchemaFromBaseFile(new Path(filePath));
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown file format :" + filePath);
|
||||
return readSchemaFromBaseFile(filePath);
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("Could not find any data file written for commit, "
|
||||
@@ -115,6 +117,21 @@ public class TableSchemaResolver {
|
||||
}
|
||||
}
|
||||
|
||||
private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
|
||||
if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
|
||||
// this is a parquet file
|
||||
return readSchemaFromParquetBaseFile(new Path(filePath));
|
||||
} else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) {
|
||||
// this is a HFile
|
||||
return readSchemaFromHFileBaseFile(new Path(filePath));
|
||||
} else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) {
|
||||
// this is a ORC file
|
||||
return readSchemaFromORCBaseFile(new Path(filePath));
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown base file format :" + filePath);
|
||||
}
|
||||
}
|
||||
|
||||
public Schema getTableAvroSchemaFromDataFile() {
|
||||
return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile());
|
||||
}
|
||||
@@ -409,19 +426,41 @@ public class TableSchemaResolver {
|
||||
/**
|
||||
* Read the parquet schema from a parquet File.
|
||||
*/
|
||||
public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException {
|
||||
public MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException {
|
||||
LOG.info("Reading schema from " + parquetFilePath);
|
||||
|
||||
FileSystem fs = metaClient.getRawFs();
|
||||
if (!fs.exists(parquetFilePath)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
|
||||
}
|
||||
ParquetMetadata fileFooter =
|
||||
ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
|
||||
return fileFooter.getFileMetaData().getSchema();
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the parquet schema from a HFile.
|
||||
*/
|
||||
public MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
|
||||
LOG.info("Reading schema from " + hFilePath);
|
||||
|
||||
FileSystem fs = metaClient.getRawFs();
|
||||
CacheConfig cacheConfig = new CacheConfig(fs.getConf());
|
||||
HoodieHFileReader<IndexedRecord> hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig);
|
||||
|
||||
return convertAvroSchemaToParquet(hFileReader.getSchema());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read the parquet schema from a ORC file.
|
||||
*/
|
||||
public MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException {
|
||||
LOG.info("Reading schema from " + orcFilePath);
|
||||
|
||||
FileSystem fs = metaClient.getRawFs();
|
||||
HoodieOrcReader<IndexedRecord> orcReader = new HoodieOrcReader<>(fs.getConf(), orcFilePath);
|
||||
|
||||
return convertAvroSchemaToParquet(orcReader.getSchema());
|
||||
}
|
||||
|
||||
/**
|
||||
* Read schema from a data file from the last compaction commit done.
|
||||
* @throws Exception
|
||||
@@ -438,7 +477,7 @@ public class TableSchemaResolver {
|
||||
String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
|
||||
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
|
||||
+ lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
|
||||
return readSchemaFromBaseFile(new Path(filePath));
|
||||
return readSchemaFromBaseFile(filePath);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi.common.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@@ -222,8 +223,15 @@ public class OrcUtils extends BaseFileUtils {
|
||||
public Schema readAvroSchema(Configuration conf, Path orcFilePath) {
|
||||
try {
|
||||
Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf));
|
||||
TypeDescription orcSchema = reader.getSchema();
|
||||
return AvroOrcUtils.createAvroSchema(orcSchema);
|
||||
if (reader.hasMetadataValue("orc.avro.schema")) {
|
||||
ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema");
|
||||
byte[] bytes = new byte[metadataValue.remaining()];
|
||||
metadataValue.get(bytes);
|
||||
return new Schema.Parser().parse(new String(bytes));
|
||||
} else {
|
||||
TypeDescription orcSchema = reader.getSchema();
|
||||
return AvroOrcUtils.createAvroSchema(orcSchema);
|
||||
}
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user