1
0

[HUDI-842] Implementation of HUDI RFC-15.

- Introduced an internal metadata table, that stores file listings.
 - metadata table is kept up to date with
 - Fixed handling of CleanerPlan.
 - [HUDI-842] Reduce parallelism to speed up the test.
 - [HUDI-842] Implementation of CLI commands for metadata operations and lookups.
 - [HUDI-842] Extend rollback metadata to include the files which have been appended to.
 - [HUDI-842] Support for rollbacks in MOR Table.
 - MarkerBasedRollbackStrategy needs to correctly provide the list of files for which rollback blocks were appended.
 - [HUDI-842] Added unit test for rollback of partial commits (inflight but not completed yet).
 - [HUDI-842] Handled the error case where metadata update succeeds but dataset commit fails.
 - [HUDI-842] Schema evolution strategy for Metadata Table. Each type of metadata saved (FilesystemMetadata, ColumnIndexMetadata, etc.) will be a separate field with default null. The type of the record will identify the valid field. This way, we can grow the schema when a new type of information is saved, while still keeping it backward compatible.
 - [HUDI-842] Fix non-partitioned case and speed up initial creation of the metadata table. Choose only 1 partition for jsc as the number of records is low (hundreds to thousands). There is more overhead in creating a large number of partitions for JavaRDD, and it slows down operations like WorkloadProfile.
For the non-partitioned case, use "." as the name of the partition to prevent empty keys in HFile.
 - [HUDI-842] Reworked metrics publishing.
 - Code has been split into reader and writer side. HoodieMetadata code is to be accessed by using HoodieTable.metadata() to get an instance of metadata for the table.
Code is serializable to allow executors to use the functionality.
 - [RFC-15] Add metrics to track the time for each file system call.
 - [RFC-15] Added a distributed metrics registry for spark which can be used to collect metrics from executors. This helps create a stats dashboard which shows the metadata table improvements in real-time for production tables.
 - [HUDI-1321] Created HoodieMetadataConfig to specify configuration for the metadata table. This is safer than full-fledged properties for the metadata table (like HoodieWriteConfig) as it makes it burdensome to tune the metadata. With limited configuration, we can control the performance of the metadata table closely.

[HUDI-1319][RFC-15] Adding interfaces for HoodieMetadata, HoodieMetadataWriter (apache#2266)
 - moved MetadataReader to HoodieBackedTableMetadata, under the HoodieTableMetadata interface
 - moved MetadataWriter to HoodieBackedTableMetadataWriter, under the HoodieTableMetadataWriter
 - Pulled all the metrics into HoodieMetadataMetrics
 - Writer now wraps the metadata, instead of extending it
 - New enum for MetadataPartitionType
 - Streamlined code flow inside HoodieBackedTableMetadataWriter w.r.t initializing metadata state
 - [HUDI-1319] Make async operations work with metadata table (apache#2332)
 - Changes the syncing model to only move over completed instants on data timeline
 - Syncing happens postCommit and on writeClient initialization
 - Latest delta commit on the metadata table is sufficient as the watermark for data timeline archival
 - Cleaning/Compaction use a suffix to the last instant written to metadata table, such that we keep the 1-1
 - .. mapping between data and metadata timelines.
 - Got rid of a lot of the complexity around checking for valid commits during open of base/log files
 - Tests now use local FS, to simulate more failure scenarios
 - Some failure scenarios exposed HUDI-1434, which is needed for MOR to work correctly

co-authored by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Prashant Wason
2020-12-30 18:29:55 -08:00
committed by vinoth chandar
parent c3e9243ea1
commit 298808baaf
45 changed files with 4060 additions and 242 deletions

View File

@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.common.HoodieEngineContext;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.metrics.Registry;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
@@ -41,6 +43,8 @@ import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.SparkHoodieIndex;
import org.apache.hudi.metrics.DistributedRegistry;
import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
@@ -51,6 +55,7 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -136,7 +141,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.UPSERT, instantTime);
table.validateUpsertSchema();
setOperationType(WriteOperationType.UPSERT);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsert(context, instantTime, records);
if (result.getIndexLookupDuration().isPresent()) {
metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis());
@@ -150,7 +155,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.UPSERT_PREPPED, instantTime);
table.validateUpsertSchema();
setOperationType(WriteOperationType.UPSERT_PREPPED);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsertPrepped(context,instantTime, preppedRecords);
return postWrite(result, instantTime, table);
}
@@ -161,7 +166,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.INSERT, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.INSERT);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insert(context,instantTime, records);
return postWrite(result, instantTime, table);
}
@@ -172,7 +177,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.INSERT_PREPPED, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.INSERT_PREPPED);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insertPrepped(context,instantTime, preppedRecords);
return postWrite(result, instantTime, table);
}
@@ -188,7 +193,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.INSERT_OVERWRITE);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata result = table.insertOverwrite(context, instantTime, records);
return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds());
}
@@ -205,7 +210,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE_TABLE, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.INSERT_OVERWRITE_TABLE);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata result = table.insertOverwriteTable(context, instantTime, records);
return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds());
}
@@ -221,7 +226,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.BULK_INSERT, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.BULK_INSERT);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsert(context,instantTime, records, userDefinedBulkInsertPartitioner);
return postWrite(result, instantTime, table);
}
@@ -232,7 +237,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime);
table.validateInsertSchema();
setOperationType(WriteOperationType.BULK_INSERT_PREPPED);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsertPrepped(context,instantTime, preppedRecords, bulkInsertPartitioner);
return postWrite(result, instantTime, table);
}
@@ -394,4 +399,34 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
}
return table;
}
@Override
public void syncTableMetadata() {
  // Open up the metadata table again, for syncing. Instantiating the writer via
  // create() brings the metadata table up to date with the data timeline as a
  // side effect of its initialization — NOTE(review): the returned writer is
  // intentionally discarded; confirm create() performs the sync in its constructor.
  // hadoopConf, config and context are inherited from the superclass.
  SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
}
/**
 * Sets up the metrics registries used by {@link HoodieWrapperFileSystem} to time
 * file-system calls, for both the dataset folder and its metadata (".hoodie") folder.
 *
 * <p>When executor-side metrics are enabled, Spark-distributed registries are created
 * and registered with the SparkContext so that metrics emitted on executors are
 * accumulated back on the driver; otherwise plain local registries are used.
 * No-op when metrics are disabled.
 */
@Override
protected void initWrapperFSMetrics() {
  if (!config.isMetricsOn()) {
    return;
  }
  final String fsRegistryName = HoodieWrapperFileSystem.class.getSimpleName();
  // Separate registry for calls against the metadata folder of the dataset.
  final String metaFolderRegistryName = fsRegistryName + "MetaFolder";
  JavaSparkContext jsc = ((HoodieSparkEngineContext) context).getJavaSparkContext();
  final Registry registry;
  final Registry registryMeta;
  if (config.isExecutorMetricsEnabled()) {
    // Create distributed registries for HoodieWrapperFileSystem and register
    // them as Spark accumulators so executor-side counts reach the driver.
    registry = Registry.getRegistry(fsRegistryName, DistributedRegistry.class.getName());
    ((DistributedRegistry) registry).register(jsc);
    registryMeta = Registry.getRegistry(metaFolderRegistryName, DistributedRegistry.class.getName());
    ((DistributedRegistry) registryMeta).register(jsc);
  } else {
    registry = Registry.getRegistry(fsRegistryName);
    registryMeta = Registry.getRegistry(metaFolderRegistryName);
  }
  HoodieWrapperFileSystem.setMetricsRegistry(registry, registryMeta);
}
}

View File

@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metrics;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hudi.common.metrics.Registry;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.AccumulatorV2;
/**
* Lightweight Metrics Registry to track Hudi events.
*/
/**
 * Lightweight metrics registry to track Hudi events, implemented as a Spark
 * {@link AccumulatorV2} so that counts incremented on executors are merged back
 * into the driver's copy via the accumulator protocol.
 *
 * <p>Thread-safe for concurrent increments (backed by a {@link ConcurrentHashMap}).
 * Must be registered with the SparkContext via {@link #register(JavaSparkContext)}
 * before use on executors.
 */
public class DistributedRegistry extends AccumulatorV2<Map<String, Long>, Map<String, Long>>
    implements Registry, Serializable {
  // Registry name; used as an optional prefix for the reported metric keys.
  private final String name;
  // Counter values keyed by metric name. Package-private access retained for
  // compatibility with any same-package callers.
  final ConcurrentHashMap<String, Long> counters = new ConcurrentHashMap<>();

  public DistributedRegistry(String name) {
    this.name = name;
  }

  /**
   * Registers this accumulator with the SparkContext, once.
   */
  public void register(JavaSparkContext jsc) {
    if (!isRegistered()) {
      jsc.sc().register(this);
    }
  }

  @Override
  public void clear() {
    counters.clear();
  }

  @Override
  public void increment(String name) {
    counters.merge(name, 1L, Long::sum);
  }

  @Override
  public void add(String name, long value) {
    counters.merge(name, value, Long::sum);
  }

  /**
   * Get all Counter type metrics.
   *
   * @param prefixWithRegistryName when true, keys are reported as "&lt;registryName&gt;.&lt;counter&gt;"
   * @return a snapshot copy of the counters (safe for the caller to mutate)
   */
  @Override
  public Map<String, Long> getAllCounts(boolean prefixWithRegistryName) {
    HashMap<String, Long> countersMap = new HashMap<>();
    counters.forEach((k, v) -> {
      String key = prefixWithRegistryName ? name + "." + k : k;
      countersMap.put(key, v);
    });
    return countersMap;
  }

  /**
   * Folds every entry of {@code arg} into this registry's counters.
   */
  @Override
  public void add(Map<String, Long> arg) {
    arg.forEach(this::add);
  }

  @Override
  public AccumulatorV2<Map<String, Long>, Map<String, Long>> copy() {
    DistributedRegistry registry = new DistributedRegistry(name);
    counters.forEach(registry::add);
    return registry;
  }

  @Override
  public boolean isZero() {
    return counters.isEmpty();
  }

  @Override
  public void merge(AccumulatorV2<Map<String, Long>, Map<String, Long>> acc) {
    acc.value().forEach(this::add);
  }

  @Override
  public void reset() {
    counters.clear();
  }

  /**
   * Returns the live internal map (not a copy), as required for cheap merging by
   * {@link #merge}. Callers must treat it as read-only.
   */
  @Override
  public Map<String, Long> value() {
    return counters;
  }
}

View File

@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metrics;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieEngineContext;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.metrics.Registry;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter;
import org.apache.hudi.metadata.HoodieMetadataMetrics;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Spark-engine writer for the Hudi metadata table. Extends
 * {@link HoodieBackedTableMetadataWriter} with Spark-specific metrics wiring and a
 * commit path that writes prepared records through a {@link SparkRDDWriteClient}.
 */
public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter {
  private static final Logger LOG = LogManager.getLogger(SparkHoodieBackedTableMetadataWriter.class);

  /**
   * Factory for the Spark metadata writer.
   *
   * @param conf        hadoop configuration for the dataset
   * @param writeConfig write config of the DATA table (the metadata table's own
   *                    config is derived by the superclass)
   * @param context     engine context; expected to be a HoodieSparkEngineContext
   */
  public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, HoodieEngineContext context) {
    return new SparkHoodieBackedTableMetadataWriter(conf, writeConfig, context);
  }

  SparkHoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, HoodieEngineContext engineContext) {
    super(hadoopConf, writeConfig, engineContext);
  }

  /**
   * Creates the "HoodieMetadata" metrics registry: a Spark-distributed registry when
   * executor metrics are enabled, a plain local one otherwise, or none at all when
   * metrics are off.
   */
  @Override
  protected void initRegistry() {
    if (metadataWriteConfig.isMetricsOn()) {
      Registry registry;
      if (metadataWriteConfig.isExecutorMetricsEnabled()) {
        registry = Registry.getRegistry("HoodieMetadata", DistributedRegistry.class.getName());
      } else {
        registry = Registry.getRegistry("HoodieMetadata");
      }
      this.metrics = Option.of(new HoodieMetadataMetrics(registry));
    } else {
      this.metrics = Option.empty();
    }
  }

  /**
   * Registers the metrics accumulator with the SparkContext (if distributed) and
   * bootstraps the metadata table when enabled. Any IOException during bootstrap
   * deliberately disables the writer rather than failing the data-table operation.
   */
  @Override
  protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) {
    try {
      metrics.map(HoodieMetadataMetrics::registry).ifPresent(registry -> {
        if (registry instanceof DistributedRegistry) {
          HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
          ((DistributedRegistry) registry).register(sparkEngineContext.getJavaSparkContext());
        }
      });
      if (enabled) {
        bootstrapIfNeeded(engineContext, datasetMetaClient);
      }
    } catch (IOException e) {
      LOG.error("Failed to initialize metadata table. Disabling the writer.", e);
      enabled = false;
    }
  }

  /**
   * Commits the given records to the metadata table partition at {@code instantTime},
   * then triggers compaction ("001" suffix) and cleaning ("002" suffix) on the
   * metadata timeline. Throws HoodieMetadataException if any write status has errors.
   */
  @Override
  protected void commit(List<HoodieRecord> records, String partitionName, String instantTime) {
    ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
    // Close any open readers first: the file layout is about to change under them.
    metadata.closeReaders();
    JavaRDD<HoodieRecord> recordRDD = prepRecords(records, partitionName);
    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
      writeClient.startCommitWithTime(instantTime);
      List<WriteStatus> statuses = writeClient.upsertPreppedRecords(recordRDD, instantTime).collect();
      statuses.forEach(writeStatus -> {
        if (writeStatus.hasErrors()) {
          throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
        }
      });
      // trigger cleaning, compaction, with suffixes based on the same instant time. This ensures that any future
      // delta commits synced over will not have an instant time lesser than the last completed instant on the
      // metadata table.
      if (writeClient.scheduleCompactionAtInstant(instantTime + "001", Option.empty())) {
        writeClient.compact(instantTime + "001");
      }
      writeClient.clean(instantTime + "002");
    }
    // Update total size of the metadata and count of base/log files
    metrics.ifPresent(m -> {
      try {
        Map<String, String> stats = m.getStats(false, metaClient, metadata);
        m.updateMetrics(Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)),
            Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)),
            Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)),
            Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_LOG_FILES)));
      } catch (HoodieIOException e) {
        // Best-effort: stats publication must not fail the commit itself.
        LOG.error("Could not publish metadata size metrics", e);
      }
    });
  }

  /**
   * Tag each record with the location.
   *
   * Since we only read the latest base file in a partition, we tag the records with the instant time of the latest
   * base file. Falls back to the latest log file's base commit time when no base file
   * exists; brand-new partitions (no base or log files) are left untagged as inserts.
   * The RDD is deliberately created with a single partition — record counts are small.
   */
  private JavaRDD<HoodieRecord> prepRecords(List<HoodieRecord> records, String partitionName) {
    HoodieTable table = HoodieSparkTable.create(metadataWriteConfig, engineContext);
    TableFileSystemView.SliceView fsView = table.getSliceView();
    List<HoodieBaseFile> baseFiles = fsView.getLatestFileSlices(partitionName)
        .map(FileSlice::getBaseFile)
        .filter(Option::isPresent)
        .map(Option::get)
        .collect(Collectors.toList());
    // All the metadata fits within a single base file
    if (partitionName.equals(MetadataPartitionType.FILES.partitionPath())) {
      if (baseFiles.size() > 1) {
        throw new HoodieMetadataException("Multiple base files found in metadata partition");
      }
    }
    JavaSparkContext jsc = ((HoodieSparkEngineContext) engineContext).getJavaSparkContext();
    String fileId;
    String instantTime;
    if (!baseFiles.isEmpty()) {
      fileId = baseFiles.get(0).getFileId();
      instantTime = baseFiles.get(0).getCommitTime();
    } else {
      // If there is a log file then we can assume that it has the data
      List<HoodieLogFile> logFiles = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath())
          .map(FileSlice::getLatestLogFile)
          .filter(Option::isPresent)
          .map(Option::get)
          .collect(Collectors.toList());
      if (logFiles.isEmpty()) {
        // No base and log files. All are new inserts
        return jsc.parallelize(records, 1);
      }
      fileId = logFiles.get(0).getFileId();
      instantTime = logFiles.get(0).getBaseCommitTime();
    }
    // fileId/instantTime are effectively final here, so they can be captured by the lambda.
    return jsc.parallelize(records, 1).map(r -> r.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId)));
  }
}