@@ -43,6 +43,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieIncompatibleSchemaException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.internal.schema.InternalSchema;
@@ -50,6 +51,7 @@ import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.io.storage.HoodieHFileReader;
import org.apache.hudi.io.storage.HoodieOrcReader;
import org.apache.hudi.util.Lazy;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroSchemaConverter;
@@ -58,100 +60,56 @@ import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import javax.annotation.concurrent.ThreadSafe;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema;
import static org.apache.hudi.avro.AvroSchemaUtils.containsFieldInSchema;
import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema;

/**
 * Helper class to read schema from data files and log files and to convert it between different formats.
 *
 * TODO(HUDI-3626) cleanup
 */
@ThreadSafe
public class TableSchemaResolver {

  private static final Logger LOG = LogManager.getLogger(TableSchemaResolver.class);

  private final HoodieTableMetaClient metaClient;
  private final boolean hasOperationField;

  /**
   * Signals whether suite of the meta-fields should have additional field designating
   * operation particular record was added by. Note, that determining whether this meta-field
   * should be appended to the schema requires reading out the actual schema of some data file,
   * since it's ultimately the source of truth whether this field has to be represented in
   * the schema
   */
  private final Lazy<Boolean> hasOperationField;

  /**
   * NOTE: {@link HoodieCommitMetadata} could be of non-trivial size for large tables (in 100s of Mbs)
   * and therefore we'd want to limit amount of throw-away work being performed while fetching
   * commits' metadata
   *
   * Please check out corresponding methods to fetch commonly used instances of {@link HoodieCommitMetadata}:
   * {@link #getLatestCommitMetadataWithValidSchema()},
   * {@link #getLatestCommitMetadataWithValidData()},
   * {@link #getCachedCommitMetadata(HoodieInstant)}
   */
  private final Lazy<ConcurrentHashMap<HoodieInstant, HoodieCommitMetadata>> commitMetadataCache;

  private volatile HoodieInstant latestCommitWithValidSchema = null;
  private volatile HoodieInstant latestCommitWithValidData = null;

  public TableSchemaResolver(HoodieTableMetaClient metaClient) {
    this.metaClient = metaClient;
    this.hasOperationField = hasOperationField();
  }

  /**
   * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
   * commit. We will assume that the schema has not changed within a single atomic write.
   *
   * @return Parquet schema for this table
   */
  private MessageType getTableParquetSchemaFromDataFile() {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
        activeTimeline.getLastCommitMetadataWithValidData();
    try {
      switch (metaClient.getTableType()) {
        case COPY_ON_WRITE:
          // For COW table, the file has data written must be in parquet or orc format currently.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            Iterator<String> filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().iterator();
            return fetchSchemaFromFiles(filePaths);
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
                + "so could not get schema for table " + metaClient.getBasePath());
          }
        case MERGE_ON_READ:
          // For MOR table, the file has data written may be a parquet file, .log file, orc file or hfile.
          // Determine the file format based on the file name, and then extract schema from it.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            Iterator<String> filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().iterator();
            return fetchSchemaFromFiles(filePaths);
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
                + "so could not get schema for table " + metaClient.getBasePath());
          }
        default:
          LOG.error("Unknown table type " + metaClient.getTableType());
          throw new InvalidTableException(metaClient.getBasePath());
      }
    } catch (IOException e) {
      throw new HoodieException("Failed to read data schema", e);
    }
  }

  private MessageType fetchSchemaFromFiles(Iterator<String> filePaths) throws IOException {
    MessageType type = null;
    while (filePaths.hasNext() && type == null) {
      String filePath = filePaths.next();
      if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
        // this is a log file
        type = readSchemaFromLogFile(new Path(filePath));
      } else {
        type = readSchemaFromBaseFile(filePath);
      }
    }
    return type;
  }

  private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
    if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
      // this is a parquet file
      return readSchemaFromParquetBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) {
      // this is a HFile
      return readSchemaFromHFileBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) {
      // this is a ORC file
      return readSchemaFromORCBaseFile(new Path(filePath));
    } else {
      throw new IllegalArgumentException("Unknown base file format :" + filePath);
    }
    this.commitMetadataCache = Lazy.lazily(() -> new ConcurrentHashMap<>(2));
    this.hasOperationField = Lazy.lazily(this::hasOperationField);
  }

  public Schema getTableAvroSchemaFromDataFile() {
@@ -176,86 +134,25 @@ public class TableSchemaResolver {
   * @throws Exception
   */
  public Schema getTableAvroSchema(boolean includeMetadataFields) throws Exception {
    Schema schema;
    Option<Schema> schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(includeMetadataFields);
    if (schemaFromCommitMetadata.isPresent()) {
      schema = schemaFromCommitMetadata.get();
    } else {
      Option<Schema> schemaFromTableConfig = metaClient.getTableConfig().getTableCreateSchema();
      if (schemaFromTableConfig.isPresent()) {
        if (includeMetadataFields) {
          schema = HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), hasOperationField);
        } else {
          schema = schemaFromTableConfig.get();
        }
      } else {
        if (includeMetadataFields) {
          schema = getTableAvroSchemaFromDataFile();
        } else {
          schema = HoodieAvroUtils.removeMetadataFields(getTableAvroSchemaFromDataFile());
        }
      }
    }

    Option<String[]> partitionFieldsOpt = metaClient.getTableConfig().getPartitionFields();
    if (metaClient.getTableConfig().shouldDropPartitionColumns()) {
      schema = recreateSchemaWhenDropPartitionColumns(partitionFieldsOpt, schema);
    }
    return schema;
    return getTableAvroSchemaInternal(includeMetadataFields, Option.empty());
  }

  public static Schema recreateSchemaWhenDropPartitionColumns(Option<String[]> partitionFieldsOpt, Schema originSchema) {
    // when hoodie.datasource.write.drop.partition.columns is true, partition columns can't be persisted in data files.
    // And there are no partition schema if the schema is parsed from data files.
    // Here we create partition Fields for this case, and use StringType as the data type.
    Schema schema = originSchema;
    if (partitionFieldsOpt.isPresent() && partitionFieldsOpt.get().length != 0) {
      List<String> partitionFields = Arrays.asList(partitionFieldsOpt.get());

      final Schema schema0 = originSchema;
      boolean hasPartitionColNotInSchema = partitionFields.stream().anyMatch(
          pt -> !HoodieAvroUtils.containsFieldInSchema(schema0, pt)
      );
      boolean hasPartitionColInSchema = partitionFields.stream().anyMatch(
          pt -> HoodieAvroUtils.containsFieldInSchema(schema0, pt)
      );
      if (hasPartitionColNotInSchema && hasPartitionColInSchema) {
        throw new HoodieIncompatibleSchemaException(
            "Not support: Partial partition fields are still in the schema "
                + "when enable hoodie.datasource.write.drop.partition.columns");
      }

      if (hasPartitionColNotInSchema) {
        // when hasPartitionColNotInSchema is true and hasPartitionColInSchema is false, all partition columns
        // are not in originSchema. So we create and add them.
        List<Field> newFields = new ArrayList<>();
        for (String partitionField: partitionFields) {
          newFields.add(new Schema.Field(
              partitionField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE));
        }
        schema = appendFieldsToSchema(schema, newFields);
      }
    }
    return schema;
  /**
   * Fetches tables schema in Avro format as of the given instant
   *
   * @param instant as of which table's schema will be fetched
   */
  public Schema getTableAvroSchema(HoodieInstant instant, boolean includeMetadataFields) throws Exception {
    return getTableAvroSchemaInternal(includeMetadataFields, Option.of(instant));
  }

  /**
   * Gets full schema (user + metadata) for a hoodie table in Parquet format.
   *
   * @return Parquet schema for the table
   * @throws Exception
   */
  public MessageType getTableParquetSchema() throws Exception {
    Option<Schema> schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(true);
    if (schemaFromCommitMetadata.isPresent()) {
      return convertAvroSchemaToParquet(schemaFromCommitMetadata.get());
    }
    Option<Schema> schemaFromTableConfig = metaClient.getTableConfig().getTableCreateSchema();
    if (schemaFromTableConfig.isPresent()) {
      Schema schema = HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), hasOperationField);
      return convertAvroSchemaToParquet(schema);
    }
    return getTableParquetSchemaFromDataFile();
    return convertAvroSchemaToParquet(getTableAvroSchema(true));
  }

  /**
@@ -271,41 +168,43 @@ public class TableSchemaResolver {
    return getTableAvroSchema(false);
  }

  /**
   * Gets users data schema for a hoodie table in Avro format of the instant.
   *
   * @param instant will get the instant data schema
   * @return Avro user data schema
   * @throws Exception
   * @deprecated use {@link #getTableSchemaFromCommitMetadata} instead
   */
  @Deprecated
  public Schema getTableAvroSchemaWithoutMetadataFields(HoodieInstant instant) throws Exception {
    Option<Schema> schemaFromCommitMetadata = getTableSchemaFromCommitMetadata(instant, false);
    if (schemaFromCommitMetadata.isPresent()) {
      return schemaFromCommitMetadata.get();
  private Schema getTableAvroSchemaInternal(boolean includeMetadataFields, Option<HoodieInstant> instantOpt) {
    Schema schema =
        (instantOpt.isPresent()
            ? getTableSchemaFromCommitMetadata(instantOpt.get(), includeMetadataFields)
            : getTableSchemaFromLatestCommitMetadata(includeMetadataFields))
        .or(() ->
            metaClient.getTableConfig().getTableCreateSchema()
                .map(tableSchema ->
                    includeMetadataFields
                        ? HoodieAvroUtils.addMetadataFields(tableSchema, hasOperationField.get())
                        : tableSchema)
        )
        .orElseGet(() -> {
          Schema schemaFromDataFile = getTableAvroSchemaFromDataFile();
          return includeMetadataFields
              ? schemaFromDataFile
              : HoodieAvroUtils.removeMetadataFields(schemaFromDataFile);
        });

    // TODO partition columns have to be appended in all read-paths
    if (metaClient.getTableConfig().shouldDropPartitionColumns()) {
      return metaClient.getTableConfig().getPartitionFields()
          .map(partitionFields -> appendPartitionColumns(schema, partitionFields))
          .orElse(schema);
    }
    Option<Schema> schemaFromTableConfig = metaClient.getTableConfig().getTableCreateSchema();
    if (schemaFromTableConfig.isPresent()) {
      return schemaFromTableConfig.get();
    }
    return HoodieAvroUtils.removeMetadataFields(getTableAvroSchemaFromDataFile());

    return schema;
  }

  /**
   * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit with valid schema.
   *
   * @return Avro schema for this table
   */
  private Option<Schema> getTableSchemaFromCommitMetadata(boolean includeMetadataFields) {
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
        metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema();
  private Option<Schema> getTableSchemaFromLatestCommitMetadata(boolean includeMetadataFields) {
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = getLatestCommitMetadataWithValidSchema();
    if (instantAndCommitMetadata.isPresent()) {
      HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
      String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
      Schema schema = new Schema.Parser().parse(schemaStr);
      if (includeMetadataFields) {
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get());
      }
      return Option.of(schema);
    } else {
@@ -313,17 +212,9 @@ public class TableSchemaResolver {
    }
  }

  /**
   * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the instant.
   *
   * @return Avro schema for this table
   */
  private Option<Schema> getTableSchemaFromCommitMetadata(HoodieInstant instant, boolean includeMetadataFields) {
    try {
      HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
      byte[] data = timeline.getInstantDetails(instant).get();
      HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
      HoodieCommitMetadata metadata = getCachedCommitMetadata(instant);
      String existingSchemaStr = metadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);

      if (StringUtils.isNullOrEmpty(existingSchemaStr)) {
@@ -332,7 +223,7 @@ public class TableSchemaResolver {

      Schema schema = new Schema.Parser().parse(existingSchemaStr);
      if (includeMetadataFields) {
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField.get());
      }
      return Option.of(schema);
    } catch (Exception e) {
@@ -341,23 +232,41 @@ public class TableSchemaResolver {
    }
  }

  /**
   * Convert a parquet scheme to the avro format.
   *
   * @param parquetSchema The parquet schema to convert
   * @return The converted avro schema
   * Fetches the schema for a table from any the table's data files
   */
  public Schema convertParquetSchemaToAvro(MessageType parquetSchema) {
  private MessageType getTableParquetSchemaFromDataFile() {
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = getLatestCommitMetadataWithValidData();
    try {
      switch (metaClient.getTableType()) {
        case COPY_ON_WRITE:
        case MERGE_ON_READ:
          // For COW table, data could be written in either Parquet or Orc format currently;
          // For MOR table, data could be written in either Parquet, Orc, Hfile or Delta-log format currently;
          //
          // Determine the file format based on the file name, and then extract schema from it.
          if (instantAndCommitMetadata.isPresent()) {
            HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
            Iterator<String> filePaths = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().iterator();
            return fetchSchemaFromFiles(filePaths);
          } else {
            throw new IllegalArgumentException("Could not find any data file written for commit, "
                + "so could not get schema for table " + metaClient.getBasePath());
          }
        default:
          LOG.error("Unknown table type " + metaClient.getTableType());
          throw new InvalidTableException(metaClient.getBasePath());
      }
    } catch (IOException e) {
      throw new HoodieException("Failed to read data schema", e);
    }
  }

  private Schema convertParquetSchemaToAvro(MessageType parquetSchema) {
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
    return avroSchemaConverter.convert(parquetSchema);
  }

  /**
   * Convert a avro scheme to the parquet format.
   *
   * @param schema The avro schema to convert
   * @return The converted parquet schema
   */
  public MessageType convertAvroSchemaToParquet(Schema schema) {
  private MessageType convertAvroSchemaToParquet(Schema schema) {
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(metaClient.getHadoopConf());
    return avroSchemaConverter.convert(schema);
  }
@@ -482,10 +391,7 @@ public class TableSchemaResolver {
    return latestSchema;
  }

  /**
   * Read the parquet schema from a parquet File.
   */
  public MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException {
  private MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);

    FileSystem fs = metaClient.getRawFs();
@@ -494,35 +400,27 @@ public class TableSchemaResolver {
    return fileFooter.getFileMetaData().getSchema();
  }

  /**
   * Read the parquet schema from a HFile.
   */
  public MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
  private MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException {
    LOG.info("Reading schema from " + hFilePath);

    FileSystem fs = metaClient.getRawFs();
    CacheConfig cacheConfig = new CacheConfig(fs.getConf());
    HoodieHFileReader<IndexedRecord> hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig);

    return convertAvroSchemaToParquet(hFileReader.getSchema());
  }

  /**
   * Read the parquet schema from a ORC file.
   */
  public MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException {
  private MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException {
    LOG.info("Reading schema from " + orcFilePath);

    FileSystem fs = metaClient.getRawFs();
    HoodieOrcReader<IndexedRecord> orcReader = new HoodieOrcReader<>(fs.getConf(), orcFilePath);

    return convertAvroSchemaToParquet(orcReader.getSchema());
  }

  /**
   * Read schema from a data file from the last compaction commit done.
   * @throws Exception
   *
   * @deprecated please use {@link #getTableAvroSchema(HoodieInstant, boolean)} instead
   */
  public MessageType readSchemaFromLastCompaction(Option<HoodieInstant> lastCompactionCommitOpt) throws Exception {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
@@ -533,18 +431,13 @@ public class TableSchemaResolver {
    // Read from the compacted file wrote
    HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata
        .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
    String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
    String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePathV2()).values().stream().findAny()
        .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
            + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
    return readSchemaFromBaseFile(filePath);
  }

  /**
   * Read the schema from the log file on path.
   *
   * @return
   */
  public MessageType readSchemaFromLogFile(Path path) throws IOException {
  private MessageType readSchemaFromLogFile(Path path) throws IOException {
    return readSchemaFromLogFile(metaClient.getRawFs(), path);
  }

@@ -566,20 +459,6 @@ public class TableSchemaResolver {
    }
  }

  public boolean isHasOperationField() {
    return hasOperationField;
  }

  private boolean hasOperationField() {
    try {
      Schema tableAvroSchema = getTableAvroSchemaFromDataFile();
      return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null;
    } catch (Exception e) {
      LOG.info(String.format("Failed to read operation field from avro schema (%s)", e.getMessage()));
      return false;
    }
  }

  /**
   * Gets the InternalSchema for a hoodie table from the HoodieCommitMetadata of the instant.
   *
@@ -587,11 +466,7 @@ public class TableSchemaResolver {
   */
  public Option<InternalSchema> getTableInternalSchemaFromCommitMetadata() {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    if (timeline.lastInstant().isPresent()) {
      return getTableInternalSchemaFromCommitMetadata(timeline.lastInstant().get());
    } else {
      return Option.empty();
    }
    return timeline.lastInstant().flatMap(this::getTableInternalSchemaFromCommitMetadata);
  }

  /**
@@ -601,9 +476,7 @@ public class TableSchemaResolver {
   */
  private Option<InternalSchema> getTableInternalSchemaFromCommitMetadata(HoodieInstant instant) {
    try {
      HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedInstants();
      byte[] data = timeline.getInstantDetails(instant).get();
      HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
      HoodieCommitMetadata metadata = getCachedCommitMetadata(instant);
      String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
      if (latestInternalSchemaStr != null) {
        return SerDeHelper.fromJson(latestInternalSchemaStr);
@@ -626,4 +499,128 @@ public class TableSchemaResolver {
    String result = manager.getHistorySchemaStr();
    return result.isEmpty() ? Option.empty() : Option.of(result);
  }

  /**
   * NOTE: This method could only be used in tests
   *
   * @VisibleForTesting
   */
  public boolean hasOperationField() {
    try {
      Schema tableAvroSchema = getTableAvroSchemaFromDataFile();
      return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null;
    } catch (Exception e) {
      LOG.info(String.format("Failed to read operation field from avro schema (%s)", e.getMessage()));
      return false;
    }
  }

  private Option<Pair<HoodieInstant, HoodieCommitMetadata>> getLatestCommitMetadataWithValidSchema() {
    if (latestCommitWithValidSchema == null) {
      Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
          metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema();
      if (instantAndCommitMetadata.isPresent()) {
        HoodieInstant instant = instantAndCommitMetadata.get().getLeft();
        HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight();
        synchronized (this) {
          if (latestCommitWithValidSchema == null) {
            latestCommitWithValidSchema = instant;
          }
          commitMetadataCache.get().putIfAbsent(instant, metadata);
        }
      }
    }

    return Option.ofNullable(latestCommitWithValidSchema)
        .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant)));
  }

  private Option<Pair<HoodieInstant, HoodieCommitMetadata>> getLatestCommitMetadataWithValidData() {
    if (latestCommitWithValidData == null) {
      Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
          metaClient.getActiveTimeline().getLastCommitMetadataWithValidData();
      if (instantAndCommitMetadata.isPresent()) {
        HoodieInstant instant = instantAndCommitMetadata.get().getLeft();
        HoodieCommitMetadata metadata = instantAndCommitMetadata.get().getRight();
        synchronized (this) {
          if (latestCommitWithValidData == null) {
            latestCommitWithValidData = instant;
          }
          commitMetadataCache.get().putIfAbsent(instant, metadata);
        }
      }
    }

    return Option.ofNullable(latestCommitWithValidData)
        .map(instant -> Pair.of(instant, commitMetadataCache.get().get(instant)));
  }

  private HoodieCommitMetadata getCachedCommitMetadata(HoodieInstant instant) {
    return commitMetadataCache.get()
        .computeIfAbsent(instant, (missingInstant) -> {
          HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
          byte[] data = timeline.getInstantDetails(missingInstant).get();
          try {
            return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
          } catch (IOException e) {
            throw new HoodieIOException(String.format("Failed to fetch HoodieCommitMetadata for instant (%s)", missingInstant), e);
          }
        });
  }

  private MessageType fetchSchemaFromFiles(Iterator<String> filePaths) throws IOException {
    MessageType type = null;
    while (filePaths.hasNext() && type == null) {
      String filePath = filePaths.next();
      if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
        // this is a log file
        type = readSchemaFromLogFile(new Path(filePath));
      } else {
        type = readSchemaFromBaseFile(filePath);
      }
    }
    return type;
  }

  private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
    if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
      return readSchemaFromParquetBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) {
      return readSchemaFromHFileBaseFile(new Path(filePath));
    } else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) {
      return readSchemaFromORCBaseFile(new Path(filePath));
    } else {
      throw new IllegalArgumentException("Unknown base file format :" + filePath);
    }
  }

  static Schema appendPartitionColumns(Schema dataSchema, String[] partitionFields) {
    // In cases when {@link DROP_PARTITION_COLUMNS} config is set true, partition columns
    // won't be persisted w/in the data files, and therefore we need to append such columns
    // when schema is parsed from data files
    //
    // Here we append partition columns with {@code StringType} as the data type
    if (partitionFields.length == 0) {
      return dataSchema;
    }

    boolean hasPartitionColNotInSchema = Arrays.stream(partitionFields).anyMatch(pf -> !containsFieldInSchema(dataSchema, pf));
    boolean hasPartitionColInSchema = Arrays.stream(partitionFields).anyMatch(pf -> containsFieldInSchema(dataSchema, pf));
    if (hasPartitionColNotInSchema && hasPartitionColInSchema) {
      throw new HoodieIncompatibleSchemaException("Partition columns could not be partially contained w/in the data schema");
    }

    if (hasPartitionColNotInSchema) {
      // when hasPartitionColNotInSchema is true and hasPartitionColInSchema is false, all partition columns
      // are not in originSchema. So we create and add them.
      List<Field> newFields = new ArrayList<>();
      for (String partitionField: partitionFields) {
        newFields.add(new Schema.Field(
            partitionField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE));
      }
      return appendFieldsToSchema(dataSchema, newFields);
    }

    return dataSchema;
  }
}
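
For context, here is a minimal usage sketch showing how the refactored resolver is typically driven from a read path; it is illustrative only and not part of this diff. The HoodieTableMetaClient builder calls and the printTableSchemas wrapper are assumptions for illustration; only methods visible in this class are exercised, and commit metadata is fetched lazily and cached across the two calls.

  // Illustrative usage sketch; hadoopConf and basePath are assumed inputs, not defined in this change.
  public static void printTableSchemas(org.apache.hadoop.conf.Configuration hadoopConf, String basePath) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(hadoopConf)
        .setBasePath(basePath)
        .build();
    TableSchemaResolver resolver = new TableSchemaResolver(metaClient);

    // User-facing schema without Hudi meta-fields.
    Schema dataSchema = resolver.getTableAvroSchema(false);
    // Full schema including meta-fields, as a Parquet MessageType.
    MessageType parquetSchema = resolver.getTableParquetSchema();

    System.out.println(dataSchema.toString(true));
    System.out.println(parquetSchema);
  }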