1
0

[HUDI-2903] get table schema from the last commit with data written (#4180)

This commit is contained in:
Yann Byron
2022-01-18 23:50:30 +08:00
committed by GitHub
parent 45f054ffde
commit a09c231911
5 changed files with 288 additions and 108 deletions

View File

@@ -21,8 +21,10 @@ package org.apache.hudi.common.table;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.SchemaCompatibility;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
@@ -40,8 +42,10 @@ import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
@@ -73,72 +77,41 @@ public class TableSchemaResolver {
* commit. We will assume that the schema has not changed within a single atomic write.
*
* @return Parquet schema for this table
* @throws Exception
*/
private MessageType getTableParquetSchemaFromDataFile() throws Exception {
private MessageType getTableParquetSchemaFromDataFile() {
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
activeTimeline.getLastCommitMetadataWithValidData();
try {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
// If this is COW, get the last commit and read the schema from a file written in the
// last commit
HoodieInstant lastCommit =
activeTimeline.getCommitsTimeline().filterCompletedInstantsWithCommitMetadata()
.lastInstant().orElseThrow(() -> new InvalidTableException(metaClient.getBasePath()));
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit "
+ lastCommit + ", could not get schema for table " + metaClient.getBasePath() + ", Metadata :"
+ commitMetadata));
return readSchemaFromBaseFile(new Path(filePath));
case MERGE_ON_READ:
// If this is MOR, depending on whether the latest commit is a delta commit or
// compaction commit
// Get a datafile written and get the schema from that file
Option<HoodieInstant> lastCompactionCommit = metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstantsWithCommitMetadata().lastInstant();
LOG.info("Found the last compaction commit as " + lastCompactionCommit);
Option<HoodieInstant> lastDeltaCommit;
if (lastCompactionCommit.isPresent()) {
lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants()
.findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE).lastInstant();
// For COW table, the file has data written must be in parquet format currently.
if (instantAndCommitMetadata.isPresent()) {
HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
return readSchemaFromBaseFile(new Path(filePath));
} else {
lastDeltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
throw new IllegalArgumentException("Could not find any data file written for commit, "
+ "so could not get schema for table " + metaClient.getBasePath());
}
LOG.info("Found the last delta commit " + lastDeltaCommit);
if (lastDeltaCommit.isPresent()) {
HoodieInstant lastDeltaInstant = lastDeltaCommit.get();
// read from the log file wrote
commitMetadata = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(lastDeltaInstant).get(),
HoodieCommitMetadata.class);
Pair<String, HoodieFileFormat> filePathWithFormat =
commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
.filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION)).findAny()
.map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG)).orElseGet(() -> {
// No Log files in Delta-Commit. Check if there are any parquet files
return commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
.filter(s -> s.contains((metaClient.getTableConfig().getBaseFileFormat().getFileExtension())))
.findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() ->
new IllegalArgumentException("Could not find any data file written for commit "
+ lastDeltaInstant + ", could not get schema for table " + metaClient.getBasePath()
+ ", CommitMetadata :" + commitMetadata));
});
switch (filePathWithFormat.getRight()) {
case HOODIE_LOG:
return readSchemaFromLogFile(lastCompactionCommit, new Path(filePathWithFormat.getLeft()));
case PARQUET:
return readSchemaFromBaseFile(new Path(filePathWithFormat.getLeft()));
default:
throw new IllegalArgumentException("Unknown file format :" + filePathWithFormat.getRight()
+ " for file " + filePathWithFormat.getLeft());
case MERGE_ON_READ:
// For MOR table, the file has data written may be a parquet file or .log file.
// Determine the file format based on the file name, and then extract schema from it.
if (instantAndCommitMetadata.isPresent()) {
HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
if (filePath.contains(HoodieLogFile.DELTA_EXTENSION)) {
// this is a log file
return readSchemaFromLogFile(new Path(filePath));
} else if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) {
// this is a parquet file
return readSchemaFromBaseFile(new Path(filePath));
} else {
throw new IllegalArgumentException("Unknown file format :" + filePath);
}
} else {
return readSchemaFromLastCompaction(lastCompactionCommit);
throw new IllegalArgumentException("Could not find any data file written for commit, "
+ "so could not get schema for table " + metaClient.getBasePath());
}
default:
LOG.error("Unknown table type " + metaClient.getTableType());
@@ -484,21 +457,6 @@ public class TableSchemaResolver {
return readSchemaFromLogFile(metaClient.getRawFs(), path);
}
/**
 * Reads the schema from the log file at {@code path}, falling back to the schema
 * of the last compaction commit when the log file does not yield one.
 *
 * @param lastCompactionCommitOpt the last compaction commit, used as the fallback source
 * @param path                    path of the log file to read
 * @return the parquet {@link MessageType} schema for the table
 * @throws Exception if the schema cannot be read from either source
 */
public MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt, Path path)
    throws Exception {
  MessageType schemaFromLog = readSchemaFromLogFile(path);
  if (schemaFromLog != null) {
    return schemaFromLog;
  }
  // Log file had no schema: fall back to the schema of the last compaction.
  LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt);
  return readSchemaFromLastCompaction(lastCompactionCommitOpt);
}
/**
* Read the schema from the log file on path.
*

View File

@@ -19,11 +19,13 @@
package org.apache.hudi.common.table.timeline;
import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -39,11 +41,14 @@ import java.io.Serializable;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Represents the Active Timeline for the Hoodie table. Instants for the last 12 hours (configurable) is in the
@@ -254,6 +259,26 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
return readDataFromPath(detailPath);
}
/**
 * Gets the most recent completed commit instant whose metadata references at least one
 * written data file, together with its parsed {@link HoodieCommitMetadata}.
 *
 * <p>Instants whose details cannot be deserialized are logged (with cause) and skipped,
 * so a single corrupt instant does not prevent finding an older valid one.
 *
 * @return the latest instant with data written paired with its commit metadata,
 *         or {@link Option#empty()} if no completed commit wrote any data file
 */
public Option<Pair<HoodieInstant, HoodieCommitMetadata>> getLastCommitMetadataWithValidData() {
  // Walk completed commits from newest to oldest.
  List<HoodieInstant> completed = getCommitsTimeline().filterCompletedInstants().getInstants()
      .sorted(Comparator.comparing(HoodieInstant::getTimestamp).reversed()).collect(Collectors.toList());
  for (HoodieInstant instant : completed) {
    try {
      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
          getInstantDetails(instant).get(), HoodieCommitMetadata.class);
      // Only accept commits that actually wrote data files.
      if (!commitMetadata.getFileIdAndRelativePaths().isEmpty()) {
        return Option.of(Pair.of(instant, commitMetadata));
      }
    } catch (IOException e) {
      // Keep the cause in the log so parse failures are diagnosable; continue scanning.
      LOG.warn("Failed to convert instant to HoodieCommitMetadata: " + instant, e);
    }
  }
  return Option.empty();
}
public Option<byte[]> readCleanerInfoAsBytes(HoodieInstant instant) {
// Cleaner metadata are always stored only in timeline .hoodie
return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName()));

View File

@@ -18,8 +18,6 @@
package org.apache.hudi.common.table.timeline;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
@@ -102,12 +100,6 @@ public class HoodieDefaultTimeline implements HoodieTimeline {
return new HoodieDefaultTimeline(instants.stream().filter(HoodieInstant::isCompleted), details);
}
@Override
public HoodieTimeline filterCompletedInstantsWithCommitMetadata() {
  // Keep only completed instants that are not delete-partition operations.
  return new HoodieDefaultTimeline(
      instants.stream()
          .filter(instant -> instant.isCompleted() && !isDeletePartitionType(instant)),
      details);
}
@Override
public HoodieTimeline filterCompletedAndCompactionInstants() {
return new HoodieDefaultTimeline(instants.stream().filter(s -> s.isCompleted()
@@ -359,21 +351,6 @@ public class HoodieDefaultTimeline implements HoodieTimeline {
return details.apply(instant);
}
@Override
public boolean isDeletePartitionType(HoodieInstant instant) {
  // Best-effort check: any instant whose details cannot be parsed as commit
  // metadata is treated as NOT a delete-partition operation, matching the
  // original empty-Option fallback.
  try {
    WriteOperationType operationType = HoodieCommitMetadata
        .fromBytes(getInstantDetails(instant).get(), HoodieCommitMetadata.class)
        .getOperationType();
    return WriteOperationType.DELETE_PARTITION.equals(operationType);
  } catch (Exception e) {
    return false;
  }
}
@Override
public boolean isEmpty(HoodieInstant instant) {
return getInstantDetails(instant).get().length == 0;

View File

@@ -131,14 +131,6 @@ public interface HoodieTimeline extends Serializable {
*/
HoodieTimeline filterCompletedAndCompactionInstants();
/**
* Filter this timeline to include completed instants, excluding instants whose
* operation type is delete-partition.
*
* @return New instance of HoodieTimeline containing only completed instants,
* excluding those whose operation type is delete-partition
*/
HoodieTimeline filterCompletedInstantsWithCommitMetadata();
/**
* Timeline to just include commits (commit/deltacommit), compaction and replace actions.
*
@@ -291,11 +283,6 @@ public interface HoodieTimeline extends Serializable {
boolean isEmpty(HoodieInstant instant);
/**
* Check whether the instant's WriteOperationType is DELETE_PARTITION.
*/
boolean isDeletePartitionType(HoodieInstant instant);
/**
* Helper methods to compare instants.
**/