[HUDI-842] Implementation of HUDI RFC-15.
- Introduced an internal metadata table that stores file listings. - metadata table is kept up to date with - Fixed handling of CleanerPlan. - [HUDI-842] Reduce parallelism to speed up the test. - [HUDI-842] Implementation of CLI commands for metadata operations and lookups. - [HUDI-842] Extend rollback metadata to include the files which have been appended to. - [HUDI-842] Support for rollbacks in MOR Table. - MarkerBasedRollbackStrategy needs to correctly provide the list of files for which rollback blocks were appended. - [HUDI-842] Added unit test for rollback of partial commits (inflight but not completed yet). - [HUDI-842] Handled the error case where metadata update succeeds but dataset commit fails. - [HUDI-842] Schema evolution strategy for Metadata Table. Each type of metadata saved (FilesystemMetadata, ColumnIndexMetadata, etc.) will be a separate field with default null. The type of the record will identify the valid field. This way, we can grow the schema when a new type of information is saved, while still keeping it backward compatible. - [HUDI-842] Fix non-partitioned case and speed up initial creation of metadata table. Choose only 1 partition for jsc as the number of records is low (hundreds to thousands). There is more overhead of creating a large number of partitions for JavaRDD and it slows down operations like WorkloadProfile. For the non-partitioned case, use "." as the name of the partition to prevent empty keys in HFile. - [HUDI-842] Reworked metrics publishing. - Code has been split into reader and writer side. HoodieMetadata code is to be accessed by using HoodieTable.metadata() to get an instance of metadata for the table. Code is serializable to allow executors to use the functionality. - [RFC-15] Add metrics to track the time for each file system call. - [RFC-15] Added a distributed metrics registry for Spark which can be used to collect metrics from executors. 
This helps create a stats dashboard which shows the metadata table improvements in real-time for production tables. - [HUDI-1321] Created HoodieMetadataConfig to specify configuration for the metadata table. This is safer than full-fledged properties for the metadata table (like HoodieWriteConfig) as it makes it burdensome to tune the metadata. With limited configuration, we can control the performance of the metadata table closely. [HUDI-1319][RFC-15] Adding interfaces for HoodieMetadata, HoodieMetadataWriter (apache#2266) - moved MetadataReader to HoodieBackedTableMetadata, under the HoodieTableMetadata interface - moved MetadataWriter to HoodieBackedTableMetadataWriter, under the HoodieTableMetadataWriter - Pulled all the metrics into HoodieMetadataMetrics - Writer now wraps the metadata, instead of extending it - New enum for MetadataPartitionType - Streamlined code flow inside HoodieBackedTableMetadataWriter w.r.t initializing metadata state - [HUDI-1319] Make async operations work with metadata table (apache#2332) - Changes the syncing model to only move over completed instants on the data timeline - Syncing happens postCommit and on writeClient initialization - Latest delta commit on the metadata table is sufficient as the watermark for data timeline archival - Cleaning/Compaction use a suffix to the last instant written to the metadata table, such that we keep the 1-1 mapping between data and metadata timelines. - Got rid of a lot of the complexity around checking for valid commits during open of base/log files - Tests now use local FS, to simulate more failure scenarios - Some failure scenarios exposed HUDI-1434, which is needed for MOR to work correctly. Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
committed by
vinoth chandar
parent
c3e9243ea1
commit
298808baaf
@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
|
||||
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
||||
import org.apache.hudi.common.metrics.Registry;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
@@ -41,6 +43,8 @@ import org.apache.hudi.exception.HoodieClusteringException;
|
||||
import org.apache.hudi.exception.HoodieCommitException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.metrics.DistributedRegistry;
|
||||
import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
@@ -51,6 +55,7 @@ import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
@@ -136,7 +141,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.UPSERT, instantTime);
|
||||
table.validateUpsertSchema();
|
||||
setOperationType(WriteOperationType.UPSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsert(context, instantTime, records);
|
||||
if (result.getIndexLookupDuration().isPresent()) {
|
||||
metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis());
|
||||
@@ -150,7 +155,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.UPSERT_PREPPED, instantTime);
|
||||
table.validateUpsertSchema();
|
||||
setOperationType(WriteOperationType.UPSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsertPrepped(context,instantTime, preppedRecords);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
@@ -161,7 +166,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.INSERT, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insert(context,instantTime, records);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
@@ -172,7 +177,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.INSERT_PREPPED, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insertPrepped(context,instantTime, preppedRecords);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
@@ -188,7 +193,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT_OVERWRITE);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata result = table.insertOverwrite(context, instantTime, records);
|
||||
return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds());
|
||||
}
|
||||
@@ -205,7 +210,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE_TABLE, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT_OVERWRITE_TABLE);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata result = table.insertOverwriteTable(context, instantTime, records);
|
||||
return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds());
|
||||
}
|
||||
@@ -221,7 +226,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.BULK_INSERT, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.BULK_INSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsert(context,instantTime, records, userDefinedBulkInsertPartitioner);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
@@ -232,7 +237,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.BULK_INSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsertPrepped(context,instantTime, preppedRecords, bulkInsertPartitioner);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
@@ -394,4 +399,34 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
/**
 * Syncs the file-listing metadata table with the dataset timeline.
 *
 * NOTE(review): the writer returned by create() is intentionally discarded here — per the
 * commit description, syncing happens as part of writer initialization, so constructing the
 * writer is the sync. Confirm against SparkHoodieBackedTableMetadataWriter.create().
 */
@Override
public void syncTableMetadata() {
  // Open up the metadata table again, for syncing
  SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
}
|
||||
|
||||
@Override
|
||||
protected void initWrapperFSMetrics() {
|
||||
if (config.isMetricsOn()) {
|
||||
Registry registry;
|
||||
Registry registryMeta;
|
||||
JavaSparkContext jsc = ((HoodieSparkEngineContext) context).getJavaSparkContext();
|
||||
|
||||
if (config.isExecutorMetricsEnabled()) {
|
||||
// Create a distributed registry for HoodieWrapperFileSystem
|
||||
registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName(),
|
||||
DistributedRegistry.class.getName());
|
||||
((DistributedRegistry)registry).register(jsc);
|
||||
registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder",
|
||||
DistributedRegistry.class.getName());
|
||||
((DistributedRegistry)registryMeta).register(jsc);
|
||||
} else {
|
||||
registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName());
|
||||
registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder");
|
||||
}
|
||||
|
||||
HoodieWrapperFileSystem.setMetricsRegistry(registry, registryMeta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.metrics;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import org.apache.hudi.common.metrics.Registry;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.util.AccumulatorV2;
|
||||
|
||||
/**
|
||||
* Lightweight Metrics Registry to track Hudi events.
|
||||
*/
|
||||
public class DistributedRegistry extends AccumulatorV2<Map<String, Long>, Map<String, Long>>
|
||||
implements Registry, Serializable {
|
||||
private String name;
|
||||
ConcurrentHashMap<String, Long> counters = new ConcurrentHashMap<>();
|
||||
|
||||
public DistributedRegistry(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public void register(JavaSparkContext jsc) {
|
||||
if (!isRegistered()) {
|
||||
jsc.sc().register(this);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
counters.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(String name) {
|
||||
counters.merge(name, 1L, (oldValue, newValue) -> oldValue + newValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(String name, long value) {
|
||||
counters.merge(name, value, (oldValue, newValue) -> oldValue + newValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all Counter type metrics.
|
||||
*/
|
||||
@Override
|
||||
public Map<String, Long> getAllCounts(boolean prefixWithRegistryName) {
|
||||
HashMap<String, Long> countersMap = new HashMap<>();
|
||||
counters.forEach((k, v) -> {
|
||||
String key = prefixWithRegistryName ? name + "." + k : k;
|
||||
countersMap.put(key, v);
|
||||
});
|
||||
return countersMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(Map<String, Long> arg) {
|
||||
arg.forEach((key, value) -> add(key, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public AccumulatorV2<Map<String, Long>, Map<String, Long>> copy() {
|
||||
DistributedRegistry registry = new DistributedRegistry(name);
|
||||
counters.forEach((key, value) -> registry.add(key, value));
|
||||
return registry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isZero() {
|
||||
return counters.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void merge(AccumulatorV2<Map<String, Long>, Map<String, Long>> acc) {
|
||||
acc.value().forEach((key, value) -> add(key, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
counters.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Long> value() {
|
||||
return counters;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.metrics;
|
||||
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.metrics.Registry;
|
||||
import org.apache.hudi.common.model.FileSlice;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieMetadataException;
|
||||
import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter;
|
||||
import org.apache.hudi.metadata.HoodieMetadataMetrics;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
|
||||
import org.apache.hudi.metadata.MetadataPartitionType;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
 * Spark-engine implementation of {@code HoodieBackedTableMetadataWriter}, which maintains the
 * internal metadata table (file listings) for a Hudi dataset.
 *
 * Writes to the metadata table are performed through a {@link SparkRDDWriteClient}; metrics are
 * published via a {@link Registry} (optionally distributed across executors).
 */
public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter {

  private static final Logger LOG = LogManager.getLogger(SparkHoodieBackedTableMetadataWriter.class);

  /** Factory for the Spark metadata writer. */
  public static HoodieTableMetadataWriter create(Configuration conf, HoodieWriteConfig writeConfig, HoodieEngineContext context) {
    return new SparkHoodieBackedTableMetadataWriter(conf, writeConfig, context);
  }

  SparkHoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, HoodieEngineContext engineContext) {
    super(hadoopConf, writeConfig, engineContext);
  }

  /**
   * Sets up the metrics registry for the metadata table, using an accumulator-backed
   * {@code DistributedRegistry} when executor-side metrics are enabled; otherwise metrics are
   * left empty when metrics are off.
   */
  @Override
  protected void initRegistry() {
    if (metadataWriteConfig.isMetricsOn()) {
      Registry registry;
      if (metadataWriteConfig.isExecutorMetricsEnabled()) {
        registry = Registry.getRegistry("HoodieMetadata", DistributedRegistry.class.getName());
      } else {
        registry = Registry.getRegistry("HoodieMetadata");
      }
      this.metrics = Option.of(new HoodieMetadataMetrics(registry));
    } else {
      this.metrics = Option.empty();
    }
  }

  /**
   * Registers the distributed metrics registry (if one is in use) with the SparkContext and
   * bootstraps the metadata table if needed.
   *
   * Initialization failures do not propagate: the error is logged and the writer is disabled,
   * so the dataset write path continues without metadata.
   */
  @Override
  protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient) {
    try {
      metrics.map(HoodieMetadataMetrics::registry).ifPresent(registry -> {
        if (registry instanceof DistributedRegistry) {
          HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
          ((DistributedRegistry) registry).register(sparkEngineContext.getJavaSparkContext());
        }
      });

      if (enabled) {
        bootstrapIfNeeded(engineContext, datasetMetaClient);
      }
    } catch (IOException e) {
      LOG.error("Failed to initialize metadata table. Disabling the writer.", e);
      enabled = false;
    }
  }

  /**
   * Commits the given records to the given metadata table partition at {@code instantTime},
   * then triggers compaction ("001" suffix) and cleaning ("002" suffix) on the metadata table.
   *
   * @param records       metadata records to upsert
   * @param partitionName metadata table partition to write into
   * @param instantTime   instant time matching the corresponding dataset instant
   * @throws HoodieMetadataException if any write status reports errors
   */
  @Override
  protected void commit(List<HoodieRecord> records, String partitionName, String instantTime) {
    ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
    // Close open readers before writing, so subsequent reads see the new commit.
    metadata.closeReaders();

    JavaRDD<HoodieRecord> recordRDD = prepRecords(records, partitionName);

    try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
      writeClient.startCommitWithTime(instantTime);
      List<WriteStatus> statuses = writeClient.upsertPreppedRecords(recordRDD, instantTime).collect();
      statuses.forEach(writeStatus -> {
        if (writeStatus.hasErrors()) {
          throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
        }
      });
      // trigger cleaning, compaction, with suffixes based on the same instant time. This ensures that any future
      // delta commits synced over will not have an instant time lesser than the last completed instant on the
      // metadata table.
      if (writeClient.scheduleCompactionAtInstant(instantTime + "001", Option.empty())) {
        writeClient.compact(instantTime + "001");
      }
      writeClient.clean(instantTime + "002");
    }

    // Update total size of the metadata and count of base/log files
    metrics.ifPresent(m -> {
      try {
        Map<String, String> stats = m.getStats(false, metaClient, metadata);
        m.updateMetrics(Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)),
            Long.parseLong(stats.get(HoodieMetadataMetrics.STAT_TOTAL_LOG_FILE_SIZE)),
            Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)),
            Integer.parseInt(stats.get(HoodieMetadataMetrics.STAT_COUNT_LOG_FILES)));
      } catch (HoodieIOException e) {
        // Best-effort metric publishing; never fail the commit over stats.
        LOG.error("Could not publish metadata size metrics", e);
      }
    });
  }

  /**
   * Tag each record with the location.
   *
   * Since we only read the latest base file in a partition, we tag the records with the instant time of the latest
   * base file.
   *
   * @param records       records to tag
   * @param partitionName metadata table partition the records belong to
   * @return a single-Spark-partition RDD of tagged records (record counts are small, so one
   *         partition avoids per-partition scheduling overhead)
   */
  private JavaRDD<HoodieRecord> prepRecords(List<HoodieRecord> records, String partitionName) {
    HoodieTable table = HoodieSparkTable.create(metadataWriteConfig, engineContext);
    TableFileSystemView.SliceView fsView = table.getSliceView();
    List<HoodieBaseFile> baseFiles = fsView.getLatestFileSlices(partitionName)
        .map(FileSlice::getBaseFile)
        .filter(Option::isPresent)
        .map(Option::get)
        .collect(Collectors.toList());

    // All the metadata fits within a single base file
    if (partitionName.equals(MetadataPartitionType.FILES.partitionPath())) {
      if (baseFiles.size() > 1) {
        throw new HoodieMetadataException("Multiple base files found in metadata partition");
      }
    }

    JavaSparkContext jsc = ((HoodieSparkEngineContext) engineContext).getJavaSparkContext();
    String fileId;
    String instantTime;
    if (!baseFiles.isEmpty()) {
      fileId = baseFiles.get(0).getFileId();
      instantTime = baseFiles.get(0).getCommitTime();
    } else {
      // If there is a log file then we can assume that it has the data
      List<HoodieLogFile> logFiles = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath())
          .map(FileSlice::getLatestLogFile)
          .filter(Option::isPresent)
          .map(Option::get)
          .collect(Collectors.toList());
      if (logFiles.isEmpty()) {
        // No base and log files. All are new inserts
        return jsc.parallelize(records, 1);
      }

      fileId = logFiles.get(0).getFileId();
      instantTime = logFiles.get(0).getBaseCommitTime();
    }

    return jsc.parallelize(records, 1).map(r -> r.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId)));
  }
}
|
||||
Reference in New Issue
Block a user