1
0

[RFC-33] [HUDI-2429][Stacked on HUDI-2560] Support full Schema evolution for Spark (#4910)

* [HUDI-2560] introduce id_based schema to support full schema evolution.

* add test for FileBasedInternalSchemaStorageManager and rebase code

* add support for change column type and fix some test case

* fix some bugs encountered in the production env and delete useless code

* fix test error

* rebase code

* fixed some nested schema change bugs

* [HUDI-2429][Stacked On HUDI-2560]Support full schema evolution for spark

* [use dummyInternalSchema instead of null]

* add support for spark3.1.x

* remove support for spark3.1.x, since some compilation failures occurred

* support spark3.1.x

* rebase and prepare solve all comments

* address all comments

* rebase code

* fixed the count(*) bug

* try to get internalSchema by parsing the commit file/history file directly, instead of using the metaclient, which is time-costly
address some comments

* fixed all comments

* fix new comments

* rebase code,fix UT failed

* fixed mistake

* rebase code ,fixed new comments

* rebase code , and prepare for address new comments

* address comments

* address new comments

* fix new issues

* control fallback original write logical
This commit is contained in:
xiarixiaoyao
2022-04-02 04:20:24 +08:00
committed by GitHub
parent 9275b8fc7e
commit 444ff496a4
89 changed files with 10352 additions and 106 deletions

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.client;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.async.AsyncArchiveService;
import org.apache.hudi.async.AsyncCleanerService;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
@@ -49,6 +50,7 @@ import org.apache.hudi.common.model.TableServiceType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTableVersion;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
@@ -71,6 +73,15 @@ import org.apache.hudi.exception.HoodieRollbackException;
import org.apache.hudi.exception.HoodieSavepointException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.action.InternalSchemaChangeApplier;
import org.apache.hudi.internal.schema.action.TableChange;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.metrics.HoodieMetrics;
@@ -85,6 +96,7 @@ import org.apache.hudi.table.upgrade.UpgradeDowngrade;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -101,6 +113,8 @@ import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY;
/**
* Abstract Write Client providing functionality for performing commit, index updates and rollback
* Reused for regular write operations like upsert/insert/bulk-insert.. as well as bootstrap
@@ -246,12 +260,42 @@ public abstract class BaseHoodieWriteClient<T extends HoodieRecordPayload, I, K,
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
// Finalize write
finalizeWrite(table, instantTime, stats);
// do save internal schema to support Implicitly add columns in write process
if (!metadata.getExtraMetadata().containsKey(SerDeHelper.LATEST_SCHEMA)
&& metadata.getExtraMetadata().containsKey(SCHEMA_KEY) && table.getConfig().getSchemaEvolutionEnable()) {
saveInternalSchema(table, instantTime, metadata);
}
// update Metadata table
writeTableMetadata(table, instantTime, commitActionType, metadata);
activeTimeline.saveAsComplete(new HoodieInstant(true, commitActionType, instantTime),
Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
}
/**
 * Persists the table's internal (id-based) schema for this commit so that columns
 * implicitly added during the write are tracked for schema evolution.
 *
 * <p>No-op when no history schema has been recorded yet for the table.
 *
 * @param table       hoodie table being committed to
 * @param instantTime commit instant (numeric timestamp string)
 * @param metadata    commit metadata to enrich with the latest schema entries
 */
private void saveInternalSchema(HoodieTable table, String instantTime, HoodieCommitMetadata metadata) {
  TableSchemaResolver resolver = new TableSchemaResolver(table.getMetaClient());
  String historySchemas = resolver.getTableHistorySchemaStrFromCommitMetadata().orElse("");
  if (historySchemas.isEmpty()) {
    // No recorded schema history: nothing to evolve or persist.
    return;
  }
  FileBasedInternalSchemaStorageManager schemaStore = new FileBasedInternalSchemaStorageManager(table.getMetaClient());
  long instantAsLong = Long.parseLong(instantTime);
  InternalSchema currentSchema = InternalSchemaUtils.searchSchema(instantAsLong,
      SerDeHelper.parseSchemas(historySchemas));
  Schema writerAvroSchema = HoodieAvroUtils.createHoodieWriteSchema(new Schema.Parser().parse(config.getSchema()));
  InternalSchema evolved = AvroSchemaEvolutionUtils.evolveSchemaFromNewAvroSchema(writerAvroSchema, currentSchema);
  if (!evolved.equals(currentSchema)) {
    // The writer schema introduced changes: stamp the evolved schema with this instant
    // and append it to the history before persisting.
    evolved.setSchemaId(instantAsLong);
    metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(evolved));
    schemaStore.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(evolved, historySchemas));
  } else {
    metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(evolved));
    //TODO save history schema by metaTable
    schemaStore.persistHistorySchemaStr(instantTime, historySchemas);
  }
  // Keep the plain avro schema entry in sync with the (possibly) evolved internal schema.
  metadata.addMetadata(SCHEMA_KEY, AvroInternalSchemaConverter.convert(evolved, writerAvroSchema.getName()).toString());
}
/**
 * Creates the {@link HoodieTable} instance for this client, delegating to the
 * three-argument overload with a {@code false} flag (presumably: skip the
 * refresh/extra initialization path — confirm against the overload's contract).
 *
 * @param config     write config to build the table with
 * @param hadoopConf hadoop configuration
 * @return the instantiated table
 */
protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) {
return createTable(config, hadoopConf, false);
}
@@ -1442,8 +1486,8 @@ public abstract class BaseHoodieWriteClient<T extends HoodieRecordPayload, I, K,
if (lastInstant.isPresent()) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
activeTimeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
if (commitMetadata.getExtraMetadata().containsKey(HoodieCommitMetadata.SCHEMA_KEY)) {
config.setSchema(commitMetadata.getExtraMetadata().get(HoodieCommitMetadata.SCHEMA_KEY));
if (commitMetadata.getExtraMetadata().containsKey(SCHEMA_KEY)) {
config.setSchema(commitMetadata.getExtraMetadata().get(SCHEMA_KEY));
} else {
throw new HoodieIOException("Latest commit does not have any schema in commit metadata");
}
@@ -1505,4 +1549,138 @@ public abstract class BaseHoodieWriteClient<T extends HoodieRecordPayload, I, K,
metaClient.reloadActiveTimeline();
}
}
/**
 * Adds a column to the table schema.
 *
 * @param colName      name of the column to add; to add inside a nested field, specify the full dotted name
 * @param schema       avro type of the column to add
 * @param doc          documentation string for the new column
 * @param position     column relative to which the new column is placed
 * @param positionType position change type; supported: first/after/before
 */
public void addColumn(String colName, Schema schema, String doc, String position, TableChange.ColumnPositionChange.ColumnPositionType positionType) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  InternalSchema evolved = applier.applyAddChange(
      colName, AvroInternalSchemaConverter.convertToField(schema), doc, position, positionType);
  commitTableChange(evolved, schemaAndClient.getRight());
}
/**
 * Adds a column with no documentation at the default position.
 *
 * @param colName name of the column to add
 * @param schema  avro type of the column to add
 */
public void addColumn(String colName, Schema schema) {
  TableChange.ColumnPositionChange.ColumnPositionType noReorder =
      TableChange.ColumnPositionChange.ColumnPositionType.NO_OPERATION;
  addColumn(colName, schema, null, "", noReorder);
}
/**
 * Deletes columns from the table schema.
 *
 * @param colNames names of the columns to delete; to delete a nested field, specify the full dotted name
 */
public void deleteColumns(String... colNames) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyDeleteChange(colNames), schemaAndClient.getRight());
}
/**
 * Renames a column of the table.
 *
 * @param colName name of the column to rename; to rename a nested field, specify the full dotted name
 * @param newName new simple name for the column (no full name needed)
 */
public void renameColumn(String colName, String newName) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyRenameChange(colName, newName), schemaAndClient.getRight());
}
/**
 * Updates the nullability attribute of a column.
 *
 * @param colName  column to change; to change a nested field, specify the full dotted name
 * @param nullable whether the column should accept nulls
 */
public void updateColumnNullability(String colName, boolean nullable) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyColumnNullabilityChange(colName, nullable), schemaAndClient.getRight());
}
/**
 * Updates the type of a column.
 *
 * <p>Only primitive-to-primitive type changes are supported; nested-to-nested or
 * nested-to-primitive changes (e.g. RecordType -&gt; MapType, MapType -&gt; LongType)
 * are not.
 *
 * @param colName column to change; to change a nested field, specify the full dotted name
 * @param newType new primitive type for the column
 */
public void updateColumnType(String colName, Type newType) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyColumnTypeChange(colName, newType), schemaAndClient.getRight());
}
/**
 * Updates the comment/doc of a column.
 *
 * @param colName column to change; to change a nested field, specify the full dotted name
 * @param doc     new documentation string for the column
 */
public void updateColumnComment(String colName, String doc) {
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyColumnCommentChange(colName, doc), schemaAndClient.getRight());
}
/**
 * Reorders the position of a column relative to another column.
 *
 * <p>Silently returns when any argument is null.
 *
 * @param colName      column to reorder; to reorder a nested field, specify the full dotted name
 * @param referColName column used as the reference position
 * @param orderType    position change type; supported: first/after/before
 */
public void reOrderColPosition(String colName, String referColName, TableChange.ColumnPositionChange.ColumnPositionType orderType) {
  boolean missingArgument = colName == null || orderType == null || referColName == null;
  if (missingArgument) {
    return;
  }
  Pair<InternalSchema, HoodieTableMetaClient> schemaAndClient = getInternalSchemaAndMetaClient();
  InternalSchemaChangeApplier applier = new InternalSchemaChangeApplier(schemaAndClient.getLeft());
  commitTableChange(applier.applyReOrderColPositionChange(colName, referColName, orderType), schemaAndClient.getRight());
}
/**
 * Loads the latest internal schema from commit metadata along with a fresh meta client.
 *
 * @return pair of (latest internal schema, meta client)
 * @throws HoodieException when no internal schema is found in the commit metadata
 */
private Pair<InternalSchema, HoodieTableMetaClient> getInternalSchemaAndMetaClient() {
  HoodieTableMetaClient metaClient = createMetaClient(true);
  Option<InternalSchema> latest = new TableSchemaResolver(metaClient).getTableInternalSchemaFromCommitMetadata();
  if (latest.isPresent()) {
    return Pair.of(latest.get(), metaClient);
  }
  throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath()));
}
/**
 * Commits a schema change as an ALTER_SCHEMA instant on the timeline: starts a new
 * instant, transitions it to inflight, records the evolved schema in the commit's
 * extra metadata, persists the updated schema history, and completes the commit.
 *
 * @param newSchema  evolved internal schema to commit
 * @param metaClient meta client of the table being altered
 * @throws HoodieCommitException when the inflight metadata cannot be saved
 */
private void commitTableChange(InternalSchema newSchema, HoodieTableMetaClient metaClient) {
  TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
  String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse("");
  Schema schema = AvroInternalSchemaConverter.convert(newSchema, config.getTableName());
  String commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, metaClient.getTableType());
  String instantTime = HoodieActiveTimeline.createNewInstantTime();
  startCommitWithTime(instantTime, commitActionType, metaClient);
  config.setSchema(schema.toString());
  HoodieActiveTimeline timeLine = metaClient.getActiveTimeline();
  HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime);
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  metadata.setOperationType(WriteOperationType.ALTER_SCHEMA);
  try {
    timeLine.transitionRequestedToInflight(requested, Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
  } catch (IOException io) {
    throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io);
  }
  Map<String, String> extraMeta = new HashMap<>();
  // BUGFIX: was Long.getLong(instantTime), which looks up a *system property* named by the
  // instant string and returns null when absent -> NPE on unboxing in setSchemaId.
  // Long.parseLong parses the instant timestamp as intended (matches saveInternalSchema).
  extraMeta.put(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(newSchema.setSchemaId(Long.parseLong(instantTime))));
  // try to save history schemas
  FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient);
  schemasManager.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(newSchema, historySchemaStr));
  commitStats(instantTime, Collections.EMPTY_LIST, Option.of(extraMeta), commitActionType);
}
}

View File

@@ -167,6 +167,22 @@ public class HoodieWriteConfig extends HoodieConfig {
+ "implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema "
+ "evolving records during an update.");
// JSON string of the table's latest id-based InternalSchema, consumed by schema-evolution
// code paths. No default: absent unless schema evolution has produced one.
public static final ConfigProperty<String> INTERNAL_SCHEMA_STRING = ConfigProperty
.key("hoodie.internal.schema")
.noDefaultValue()
.withDocumentation("Schema string representing the latest schema of the table. Hudi passes this to "
+ "implementations of evolution of schema");
// Master switch for full (schema-on-read) schema evolution; disabled by default.
public static final ConfigProperty<Boolean> SCHEMA_EVOLUTION_ENABLE = ConfigProperty
.key("hoodie.schema.on.read.enable")
.defaultValue(false)
.withDocumentation("enable full schema evolution for hoodie")
// Caches internal schemas looked up for queries on the driver/executor side; off by default.
public static final ConfigProperty<Boolean> ENABLE_INTERNAL_SCHEMA_CACHE = ConfigProperty
.key("hoodie.schema.cache.enable")
.defaultValue(false)
.withDocumentation("cache query internalSchemas in driver/executor side");
public static final ConfigProperty<String> AVRO_SCHEMA_VALIDATE_ENABLE = ConfigProperty
.key("hoodie.avro.schema.validate")
.defaultValue("false")
@@ -886,6 +902,30 @@ public class HoodieWriteConfig extends HoodieConfig {
setValue(AVRO_SCHEMA_STRING, schemaStr);
}
/**
 * Returns the JSON-serialized latest internal schema, or {@code null} when
 * "hoodie.internal.schema" has not been set (the property declares no default).
 */
public String getInternalSchema() {
return getString(INTERNAL_SCHEMA_STRING);
}
/** Whether internal schemas looked up for queries are cached on the driver/executor side ("hoodie.schema.cache.enable"). */
public boolean getInternalSchemaCacheEnable() {
return getBoolean(ENABLE_INTERNAL_SCHEMA_CACHE);
}
/**
 * Sets the JSON-serialized internal schema string ("hoodie.internal.schema").
 *
 * @param internalSchemaString serialized InternalSchema to record in this config
 */
public void setInternalSchemaString(String internalSchemaString) {
setValue(INTERNAL_SCHEMA_STRING, internalSchemaString);
}
/**
 * Enables/disables caching of internal schemas for queries ("hoodie.schema.cache.enable").
 *
 * @param enable true to cache internal schemas on the driver/executor side
 */
public void setInternalSchemaCacheEnable(boolean enable) {
setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable));
}
/** Whether full (schema-on-read) schema evolution is enabled ("hoodie.schema.on.read.enable"). */
public boolean getSchemaEvolutionEnable() {
return getBoolean(SCHEMA_EVOLUTION_ENABLE);
}
/**
 * Enables/disables full (schema-on-read) schema evolution ("hoodie.schema.on.read.enable").
 *
 * @param enable true to enable full schema evolution
 */
public void setSchemaEvolutionEnable(boolean enable) {
setValue(SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable));
}
/**
* Get the write schema for written records.
*
@@ -2075,6 +2115,16 @@ public class HoodieWriteConfig extends HoodieConfig {
return this;
}
/**
 * Enables/disables full (schema-on-read) schema evolution ("hoodie.schema.on.read.enable").
 *
 * @param enable true to enable full schema evolution
 * @return this builder, for chaining
 */
public Builder withSchemaEvolutionEnable(boolean enable) {
writeConfig.setValue(SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable));
return this;
}
/**
 * Enables/disables caching of internal schemas for queries ("hoodie.schema.cache.enable").
 *
 * @param enable true to cache internal schemas on the driver/executor side
 * @return this builder, for chaining
 */
public Builder withInternalSchemaCacheEnable(boolean enable) {
writeConfig.setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable));
return this;
}
public Builder withAvroSchemaValidate(boolean enable) {
writeConfig.setValue(AVRO_SCHEMA_VALIDATE_ENABLE, String.valueOf(enable));
return this;

View File

@@ -39,6 +39,7 @@ import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.BaseActionExecutor;
@@ -240,6 +241,14 @@ public class CleanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends
List<HoodieInstant> pendingCleanInstants = table.getCleanTimeline()
.filterInflightsAndRequested().getInstants().collect(Collectors.toList());
if (pendingCleanInstants.size() > 0) {
// try to clean old history schema.
try {
FileBasedInternalSchemaStorageManager fss = new FileBasedInternalSchemaStorageManager(table.getMetaClient());
fss.cleanOldFiles(pendingCleanInstants.stream().map(is -> is.getTimestamp()).collect(Collectors.toList()));
} catch (Exception e) {
// we should not affect original clean logic. Swallow exception and log warn.
LOG.warn("failed to clean old history schema");
}
pendingCleanInstants.forEach(hoodieInstant -> {
if (table.getCleanTimeline().isEmpty(hoodieInstant)) {
table.getActiveTimeline().deleteEmptyInstantIfExists(hoodieInstant);

View File

@@ -18,14 +18,24 @@
package org.apache.hudi.table.action.commit;
import org.apache.avro.SchemaCompatibility;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.action.InternalSchemaMerger;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
@@ -41,6 +51,8 @@ import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
public class HoodieMergeHelper<T extends HoodieRecordPayload> extends
BaseMergeHelper<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> {
@@ -78,12 +90,41 @@ public class HoodieMergeHelper<T extends HoodieRecordPayload> extends
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
Option<InternalSchema> querySchemaOpt = SerDeHelper.fromJson(table.getConfig().getInternalSchema());
boolean needToReWriteRecord = false;
// TODO support bootstrap
if (querySchemaOpt.isPresent() && !baseFile.getBootstrapBaseFile().isPresent()) {
// check implicitly add columns, and position reorder(spark sql may change cols order)
InternalSchema querySchema = AvroSchemaEvolutionUtils.evolveSchemaFromNewAvroSchema(readSchema, querySchemaOpt.get(), true);
long commitInstantTime = Long.valueOf(FSUtils.getCommitTime(mergeHandle.getOldFilePath().getName()));
InternalSchema writeInternalSchema = InternalSchemaCache.searchSchemaAndCache(commitInstantTime, table.getMetaClient(), table.getConfig().getInternalSchemaCacheEnable());
if (writeInternalSchema.isEmptySchema()) {
throw new HoodieException(String.format("cannot find file schema for current commit %s", commitInstantTime));
}
List<String> colNamesFromQuerySchema = querySchema.getAllColsFullName();
List<String> colNamesFromWriteSchema = writeInternalSchema.getAllColsFullName();
List<String> sameCols = colNamesFromWriteSchema.stream()
.filter(f -> colNamesFromQuerySchema.contains(f)
&& writeInternalSchema.findIdByName(f) == querySchema.findIdByName(f)
&& writeInternalSchema.findIdByName(f) != -1
&& writeInternalSchema.findType(writeInternalSchema.findIdByName(f)).equals(querySchema.findType(writeInternalSchema.findIdByName(f)))).collect(Collectors.toList());
readSchema = AvroInternalSchemaConverter.convert(new InternalSchemaMerger(writeInternalSchema, querySchema, true, false).mergeSchema(), readSchema.getName());
Schema writeSchemaFromFile = AvroInternalSchemaConverter.convert(writeInternalSchema, readSchema.getName());
needToReWriteRecord = sameCols.size() != colNamesFromWriteSchema.size()
|| SchemaCompatibility.checkReaderWriterCompatibility(writeSchemaFromFile, readSchema).getType() == org.apache.avro.SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE;
}
try {
final Iterator<GenericRecord> readerIterator;
if (baseFile.getBootstrapBaseFile().isPresent()) {
readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
} else {
readerIterator = reader.getRecordIterator(readSchema);
if (needToReWriteRecord) {
readerIterator = HoodieAvroUtils.rewriteRecordWithNewSchema(reader.getRecordIterator(), readSchema);
} else {
readerIterator = reader.getRecordIterator(readSchema);
}
}
ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();

View File

@@ -44,9 +44,12 @@ import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.io.IOUtils;
import org.apache.hudi.table.HoodieCompactionHandler;
import org.apache.hudi.table.HoodieTable;
@@ -117,8 +120,10 @@ public abstract class HoodieCompactor<T extends HoodieRecordPayload, I, K, O> im
// log file.That is because in the case of MergeInto, the config.getSchema may not
// the same with the table schema.
try {
Schema readerSchema = schemaResolver.getTableAvroSchema(false);
config.setSchema(readerSchema.toString());
if (StringUtils.isNullOrEmpty(config.getInternalSchema())) {
Schema readerSchema = schemaResolver.getTableAvroSchema(false);
config.setSchema(readerSchema.toString());
}
} catch (Exception e) {
// If there is no commit in the table, just ignore the exception.
}
@@ -145,9 +150,17 @@ public abstract class HoodieCompactor<T extends HoodieRecordPayload, I, K, O> im
String instantTime,
TaskContextSupplier taskContextSupplier) throws IOException {
FileSystem fs = metaClient.getFs();
Schema readerSchema = HoodieAvroUtils.addMetadataFields(
new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
Schema readerSchema;
Option<InternalSchema> internalSchemaOption = Option.empty();
if (!StringUtils.isNullOrEmpty(config.getInternalSchema())) {
readerSchema = new Schema.Parser().parse(config.getSchema());
internalSchemaOption = SerDeHelper.fromJson(config.getInternalSchema());
// its safe to modify config here, since we running in task side.
((HoodieTable) compactionHandler).getConfig().setDefault(config);
} else {
readerSchema = HoodieAvroUtils.addMetadataFields(
new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
}
LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames()
+ " for commit " + instantTime);
// TODO - FIX THIS
@@ -172,6 +185,7 @@ public abstract class HoodieCompactor<T extends HoodieRecordPayload, I, K, O> im
.withLogFilePaths(logFiles)
.withReaderSchema(readerSchema)
.withLatestInstantTime(maxInstantTime)
.withInternalSchema(internalSchemaOption.orElse(InternalSchema.getEmptyInternalSchema()))
.withMaxMemorySizeInBytes(maxMemoryPerCompaction)
.withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
.withReverseReader(config.getCompactionReverseLogReadEnabled())

View File

@@ -30,8 +30,11 @@ import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.InternalSchemaCache;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCompactionException;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.table.HoodieCompactionHandler;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.BaseActionExecutor;
@@ -70,8 +73,19 @@ public class RunCompactionActionExecutor<T extends HoodieRecordPayload> extends
HoodieCompactionPlan compactionPlan =
CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
// try to load internalSchema to support schema Evolution
HoodieWriteConfig configCopy = config;
Pair<Option<String>, Option<String>> schemaPair = InternalSchemaCache
.getInternalSchemaAndAvroSchemaForClusteringAndCompaction(table.getMetaClient(), instantTime);
if (schemaPair.getLeft().isPresent() && schemaPair.getRight().isPresent()) {
// should not influence the original config, just copy it
configCopy = HoodieWriteConfig.newBuilder().withProperties(config.getProps()).build();
configCopy.setInternalSchemaString(schemaPair.getLeft().get());
configCopy.setSchema(schemaPair.getRight().get());
}
HoodieData<WriteStatus> statuses = compactor.compact(
context, compactionPlan, table, config, instantTime, compactionHandler);
context, compactionPlan, table, configCopy, instantTime, compactionHandler);
compactor.maybePersist(statuses, config);
context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata");
@@ -81,7 +95,10 @@ public class RunCompactionActionExecutor<T extends HoodieRecordPayload> extends
metadata.addWriteStat(stat.getPartitionPath(), stat);
}
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
if (schemaPair.getLeft().isPresent()) {
metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, schemaPair.getLeft().get());
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schemaPair.getRight().get());
}
compactionMetadata.setWriteStatuses(statuses);
compactionMetadata.setCommitted(false);
compactionMetadata.setCommitMetadata(Option.of(metadata));