Fixes needed to run merge-on-read testing on production scale data
This commit is contained in:
committed by
prazanna
parent
57ab7a2405
commit
aee136777b
@@ -26,6 +26,7 @@ import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
|
||||
import com.uber.hoodie.common.HoodieCleanStat;
|
||||
import com.uber.hoodie.common.HoodieRollbackStat;
|
||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
@@ -39,6 +40,7 @@ import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
@@ -57,7 +59,6 @@ import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
@@ -365,6 +366,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
new HoodieInstant(true, actionType, commitTime),
|
||||
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
// Save was a success
|
||||
// Do a inline compaction if enabled
|
||||
if (config.isInlineCompaction()) {
|
||||
Optional<HoodieCompactionMetadata> compactionMetadata = table.compact(jsc);
|
||||
if (compactionMetadata.isPresent()) {
|
||||
logger.info("Compacted successfully on commit " + commitTime);
|
||||
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "true");
|
||||
} else {
|
||||
logger.info("Compaction did not run for commit " + commitTime);
|
||||
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "false");
|
||||
}
|
||||
}
|
||||
|
||||
// We cannot have unbounded commit files. Archive commits if we have to archive
|
||||
archiveLog.archiveIfRequired();
|
||||
if(config.isAutoClean()) {
|
||||
@@ -785,11 +798,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
|
||||
public void startCommitWithTime(String commitTime) {
|
||||
logger.info("Generate a new commit time " + commitTime);
|
||||
HoodieTableMetaClient metaClient =
|
||||
new HoodieTableMetaClient(fs, config.getBasePath(), true);
|
||||
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
String commitActionType = table.getCommitActionType();
|
||||
activeTimeline.createInflight(
|
||||
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime));
|
||||
new HoodieInstant(true, commitActionType, commitTime));
|
||||
}
|
||||
|
||||
public static SparkConf registerClasses(SparkConf conf) {
|
||||
@@ -829,11 +843,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
|
||||
* @throws IOException
|
||||
*/
|
||||
private void rollbackInflightCommits() {
|
||||
HoodieTableMetaClient metaClient =
|
||||
new HoodieTableMetaClient(fs, config.getBasePath(), true);
|
||||
HoodieTimeline inflightTimeline =
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterInflights();
|
||||
|
||||
HoodieTable<T> table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
|
||||
HoodieTimeline inflightTimeline = table.getCommitTimeline().filterInflights();
|
||||
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
Collections.reverse(commits);
|
||||
|
||||
@@ -37,6 +37,14 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
public static final String AUTO_CLEAN_PROP = "hoodie.clean.automatic";
|
||||
private static final String DEFAULT_AUTO_CLEAN = "true";
|
||||
|
||||
// Turn on inline compaction - after fw delta commits a inline compaction will be run
|
||||
public static final String INLINE_COMPACT_PROP = "hoodie.compact.inline";
|
||||
private static final String DEFAULT_INLINE_COMPACT = "true";
|
||||
|
||||
// Run a compaction every N delta commits
|
||||
public static final String INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = "hoodie.compact.inline.max.delta.commits";
|
||||
private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = "4";
|
||||
|
||||
public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP =
|
||||
"hoodie.cleaner.fileversions.retained";
|
||||
private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3";
|
||||
@@ -102,6 +110,16 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withInlineCompaction(Boolean inlineCompaction) {
|
||||
props.setProperty(INLINE_COMPACT_PROP, String.valueOf(inlineCompaction));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder inlineCompactionEvery(int deltaCommits) {
|
||||
props.setProperty(INLINE_COMPACT_PROP, String.valueOf(deltaCommits));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder withCleanerPolicy(HoodieCleaningPolicy policy) {
|
||||
props.setProperty(CLEANER_POLICY_PROP, policy.name());
|
||||
return this;
|
||||
@@ -153,6 +171,10 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
||||
HoodieCompactionConfig config = new HoodieCompactionConfig(props);
|
||||
setDefaultOnCondition(props, !props.containsKey(AUTO_CLEAN_PROP),
|
||||
AUTO_CLEAN_PROP, DEFAULT_AUTO_CLEAN);
|
||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_PROP),
|
||||
INLINE_COMPACT_PROP, DEFAULT_INLINE_COMPACT);
|
||||
setDefaultOnCondition(props, !props.containsKey(INLINE_COMPACT_NUM_DELTA_COMMITS_PROP),
|
||||
INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS);
|
||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP),
|
||||
CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY);
|
||||
setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP),
|
||||
|
||||
@@ -19,6 +19,7 @@ package com.uber.hoodie.config;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig.Builder;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.io.HoodieCleaner;
|
||||
import com.uber.hoodie.metrics.MetricsReporterType;
|
||||
@@ -148,6 +149,15 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
|
||||
return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.AUTO_CLEAN_PROP));
|
||||
}
|
||||
|
||||
public boolean isInlineCompaction() {
|
||||
return Boolean.parseBoolean(props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_PROP));
|
||||
}
|
||||
|
||||
public int getInlineCompactDeltaCommitMax() {
|
||||
return Integer.parseInt(
|
||||
props.getProperty(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP));
|
||||
}
|
||||
|
||||
/**
|
||||
* index properties
|
||||
**/
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.exception;
|
||||
|
||||
public class HoodieCompactionException extends HoodieException {
|
||||
public HoodieCompactionException(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
|
||||
public HoodieCompactionException(String msg, Throwable e) {
|
||||
super(msg, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,7 +101,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
||||
throw new HoodieUpsertException(
|
||||
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId
|
||||
+ " on commit " + commitTime + " on HDFS path " + hoodieTable
|
||||
.getMetaClient().getBasePath());
|
||||
.getMetaClient().getBasePath() + partitionPath, e);
|
||||
}
|
||||
writeStatus.getStat().setFullPath(currentLogFile.getPath().toString());
|
||||
}
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Spliterator;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.function.Consumer;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
/**
|
||||
* This reads a bunch of HoodieRecords from avro log files and deduplicates and mantains the merged
|
||||
* state in memory. This is useful for compaction and record reader
|
||||
*/
|
||||
public class HoodieAvroReader implements Iterable<HoodieRecord<HoodieAvroPayload>> {
|
||||
|
||||
private final Collection<HoodieRecord<HoodieAvroPayload>> records;
|
||||
private AtomicLong totalLogFiles = new AtomicLong(0);
|
||||
private AtomicLong totalLogRecords = new AtomicLong(0);
|
||||
private long totalRecordsToUpdate;
|
||||
|
||||
|
||||
public HoodieAvroReader(FileSystem fs, List<String> logFilePaths, Schema readerSchema) {
|
||||
Map<String, HoodieRecord<HoodieAvroPayload>> records = Maps.newHashMap();
|
||||
for (String path : logFilePaths) {
|
||||
totalLogFiles.incrementAndGet();
|
||||
List<HoodieRecord<HoodieAvroPayload>> recordsFromFile = AvroUtils
|
||||
.loadFromFile(fs, path, readerSchema);
|
||||
totalLogRecords.addAndGet(recordsFromFile.size());
|
||||
for (HoodieRecord<HoodieAvroPayload> recordFromFile : recordsFromFile) {
|
||||
String key = recordFromFile.getRecordKey();
|
||||
if (records.containsKey(key)) {
|
||||
// Merge and store the merged record
|
||||
HoodieAvroPayload combinedValue = records.get(key).getData()
|
||||
.preCombine(recordFromFile.getData());
|
||||
records.put(key, new HoodieRecord<>(new HoodieKey(key, recordFromFile.getPartitionPath()),
|
||||
combinedValue));
|
||||
} else {
|
||||
// Put the record as is
|
||||
records.put(key, recordFromFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
this.records = records.values();
|
||||
this.totalRecordsToUpdate = records.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<HoodieRecord<HoodieAvroPayload>> iterator() {
|
||||
return records.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEach(Consumer<? super HoodieRecord<HoodieAvroPayload>> consumer) {
|
||||
records.forEach(consumer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Spliterator<HoodieRecord<HoodieAvroPayload>> spliterator() {
|
||||
return records.spliterator();
|
||||
}
|
||||
|
||||
public long getTotalLogFiles() {
|
||||
return totalLogFiles.get();
|
||||
}
|
||||
|
||||
public long getTotalLogRecords() {
|
||||
return totalLogRecords.get();
|
||||
}
|
||||
|
||||
public long getTotalRecordsToUpdate() {
|
||||
return totalRecordsToUpdate;
|
||||
}
|
||||
}
|
||||
@@ -22,8 +22,10 @@ import com.uber.hoodie.common.HoodieCleanStat;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
@@ -107,8 +109,18 @@ public class HoodieCleaner<T extends HoodieRecordPayload<T>> {
|
||||
}
|
||||
// Delete the remaining files
|
||||
while (commitItr.hasNext()) {
|
||||
HoodieDataFile nextRecord = commitItr.next();
|
||||
deletePaths.add(String.format("%s/%s/%s", config.getBasePath(), partitionPath,
|
||||
commitItr.next().getFileName()));
|
||||
nextRecord.getFileName()));
|
||||
if (hoodieTable.getMetaClient().getTableType()
|
||||
== HoodieTableType.MERGE_ON_READ) {
|
||||
// If merge on read, then clean the log files for the commits as well
|
||||
deletePaths.add(String
|
||||
.format("%s/%s/%s", config.getBasePath(), partitionPath,
|
||||
FSUtils.maskWithoutLogVersion(nextRecord.getCommitTime(),
|
||||
nextRecord.getFileId(),
|
||||
HoodieLogFile.DELTA_EXTENSION)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return deletePaths;
|
||||
@@ -182,6 +194,14 @@ public class HoodieCleaner<T extends HoodieRecordPayload<T>> {
|
||||
deletePaths.add(String
|
||||
.format("%s/%s/%s", config.getBasePath(), partitionPath, FSUtils
|
||||
.maskWithoutTaskPartitionId(fileCommitTime, afile.getFileId())));
|
||||
if (hoodieTable.getMetaClient().getTableType()
|
||||
== HoodieTableType.MERGE_ON_READ) {
|
||||
// If merge on read, then clean the log files for the commits as well
|
||||
deletePaths.add(String
|
||||
.format("%s/%s/%s", config.getBasePath(), partitionPath,
|
||||
FSUtils.maskWithoutLogVersion(fileCommitTime, afile.getFileId(),
|
||||
HoodieLogFile.DELTA_EXTENSION)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
|
||||
/**
|
||||
* Place holder for the compaction specific meta-data, uses all the details used in a normal HoodieCommitMetadata
|
||||
*/
|
||||
public class HoodieCompactionMetadata extends HoodieCommitMetadata {
|
||||
|
||||
}
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
package com.uber.hoodie.io.compact;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
|
||||
@@ -16,9 +16,14 @@
|
||||
|
||||
package com.uber.hoodie.io.compact;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.CompactionWriteStat;
|
||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
import com.uber.hoodie.common.model.HoodieWriteStat;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
@@ -28,18 +33,19 @@ import com.uber.hoodie.common.util.AvroUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieCompactionException;
|
||||
import com.uber.hoodie.io.HoodieAvroReader;
|
||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
||||
import com.uber.hoodie.table.HoodieMergeOnReadTable;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.util.Collection;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.StreamSupport;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.commons.collections.IteratorUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -48,7 +54,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static java.util.stream.Collectors.*;
|
||||
|
||||
@@ -60,118 +65,129 @@ import static java.util.stream.Collectors.*;
|
||||
* @see HoodieCompactor
|
||||
*/
|
||||
public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
||||
private static Logger log = LogManager.getLogger(HoodieRealtimeTableCompactor.class);
|
||||
|
||||
@Override
|
||||
public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
|
||||
HoodieTable hoodieTable,
|
||||
CompactionFilter compactionFilter) throws Exception {
|
||||
// TODO - rollback any compactions in flight
|
||||
private static Logger log = LogManager.getLogger(HoodieRealtimeTableCompactor.class);
|
||||
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
String compactionCommit = startCompactionCommit(hoodieTable);
|
||||
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
|
||||
List<String> partitionPaths =
|
||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath());
|
||||
@Override
|
||||
public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
|
||||
HoodieTable hoodieTable, CompactionFilter compactionFilter) throws IOException {
|
||||
Preconditions.checkArgument(
|
||||
hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
||||
"HoodieRealtimeTableCompactor can only compact table of type "
|
||||
+ HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
|
||||
.getTableType().name());
|
||||
|
||||
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||
List<CompactionOperation> operations =
|
||||
jsc.parallelize(partitionPaths, partitionPaths.size())
|
||||
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> hoodieTable.getFileSystemView()
|
||||
.groupLatestDataFileWithLogFiles(partitionPath).entrySet()
|
||||
.stream()
|
||||
.map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue()))
|
||||
.collect(toList()).iterator()).collect();
|
||||
log.info("Total of " + operations.size() + " compactions are retrieved");
|
||||
// TODO - rollback any compactions in flight
|
||||
|
||||
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only
|
||||
operations = compactionFilter.filter(operations);
|
||||
if(operations.isEmpty()) {
|
||||
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||
return null;
|
||||
}
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
String compactionCommit = startCompactionCommit(hoodieTable);
|
||||
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
|
||||
List<String> partitionPaths =
|
||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath());
|
||||
|
||||
log.info("After filtering, Compacting " + operations + " files");
|
||||
List<Tuple2<String, HoodieWriteStat>> updateStatusMap =
|
||||
jsc.parallelize(operations, operations.size()).map(
|
||||
(Function<CompactionOperation, Iterator<List<WriteStatus>>>) compactionOperation -> executeCompaction(
|
||||
metaClient, config, compactionOperation, compactionCommit)).flatMap(
|
||||
(FlatMapFunction<Iterator<List<WriteStatus>>, WriteStatus>) listIterator -> {
|
||||
List<List<WriteStatus>> collected = IteratorUtils.toList(listIterator);
|
||||
return collected.stream().flatMap(List::stream).collect(toList()).iterator();
|
||||
}).mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
|
||||
@Override
|
||||
public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
|
||||
throws Exception {
|
||||
return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
|
||||
}
|
||||
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||
List<CompactionOperation> operations =
|
||||
jsc.parallelize(partitionPaths, partitionPaths.size())
|
||||
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> hoodieTable
|
||||
.getFileSystemView()
|
||||
.groupLatestDataFileWithLogFiles(partitionPath).entrySet()
|
||||
.stream()
|
||||
.map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue()))
|
||||
.collect(toList()).iterator()).collect();
|
||||
log.info("Total of " + operations.size() + " compactions are retrieved");
|
||||
|
||||
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only
|
||||
operations = compactionFilter.filter(operations);
|
||||
if (operations.isEmpty()) {
|
||||
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||
return null;
|
||||
}
|
||||
|
||||
log.info("After filtering, Compacting " + operations + " files");
|
||||
List<CompactionWriteStat> updateStatusMap =
|
||||
jsc.parallelize(operations, operations.size())
|
||||
.map(s -> executeCompaction(metaClient, config, s, compactionCommit))
|
||||
.flatMap(new FlatMapFunction<List<CompactionWriteStat>, CompactionWriteStat>() {
|
||||
@Override
|
||||
public Iterator<CompactionWriteStat> call(
|
||||
List<CompactionWriteStat> compactionWriteStats)
|
||||
throws Exception {
|
||||
return compactionWriteStats.iterator();
|
||||
}
|
||||
}).collect();
|
||||
|
||||
HoodieCompactionMetadata metadata = new HoodieCompactionMetadata();
|
||||
for (Tuple2<String, HoodieWriteStat> stat : updateStatusMap) {
|
||||
metadata.addWriteStat(stat._1(), stat._2());
|
||||
}
|
||||
log.info("Compaction finished with result " + metadata);
|
||||
HoodieCompactionMetadata metadata = new HoodieCompactionMetadata();
|
||||
for (CompactionWriteStat stat : updateStatusMap) {
|
||||
metadata.addWriteStat(stat.getPartitionPath(), stat);
|
||||
}
|
||||
log.info("Compaction finished with result " + metadata);
|
||||
|
||||
//noinspection ConstantConditions
|
||||
if (isCompactionSucceeded(metadata)) {
|
||||
log.info("Compaction succeeded " + compactionCommit);
|
||||
commitCompaction(compactionCommit, metaClient, metadata);
|
||||
} else {
|
||||
log.info("Compaction failed " + compactionCommit);
|
||||
}
|
||||
return metadata;
|
||||
//noinspection ConstantConditions
|
||||
if (isCompactionSucceeded(metadata)) {
|
||||
log.info("Compaction succeeded " + compactionCommit);
|
||||
commitCompaction(compactionCommit, metaClient, metadata);
|
||||
} else {
|
||||
log.info("Compaction failed " + compactionCommit);
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
|
||||
private boolean isCompactionSucceeded(HoodieCompactionMetadata result) {
|
||||
//TODO figure out a success factor for a compaction
|
||||
return true;
|
||||
}
|
||||
|
||||
private List<CompactionWriteStat> executeCompaction(HoodieTableMetaClient metaClient,
|
||||
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
|
||||
throws IOException {
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
Schema readerSchema =
|
||||
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||
|
||||
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
|
||||
.getDeltaFilePaths() + " for commit " + commitTime);
|
||||
// TODO - FIX THIS
|
||||
// Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover).
|
||||
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
|
||||
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
||||
|
||||
HoodieAvroReader avroReader = new HoodieAvroReader(fs, operation.getDeltaFilePaths(),
|
||||
readerSchema);
|
||||
if (!avroReader.iterator().hasNext()) {
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
private boolean isCompactionSucceeded(HoodieCompactionMetadata result) {
|
||||
//TODO figure out a success factor for a compaction
|
||||
return true;
|
||||
}
|
||||
|
||||
private Iterator<List<WriteStatus>> executeCompaction(HoodieTableMetaClient metaClient,
|
||||
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
|
||||
throws IOException {
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
Schema schema =
|
||||
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||
|
||||
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
|
||||
.getDeltaFilePaths() + " for commit " + commitTime);
|
||||
// TODO - FIX THIS
|
||||
// 1. Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover).
|
||||
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
|
||||
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
||||
|
||||
// 2. naively loads all the delta records in memory to merge it,
|
||||
// since we only need a iterator, we could implement a lazy iterator to load from one delta file at a time
|
||||
List<HoodieRecord<HoodieAvroPayload>> readDeltaFilesInMemory =
|
||||
AvroUtils.loadFromFiles(fs, operation.getDeltaFilePaths(), schema);
|
||||
|
||||
if(readDeltaFilesInMemory.isEmpty()) {
|
||||
return IteratorUtils.emptyIterator();
|
||||
}
|
||||
|
||||
// Compacting is very similar to applying updates to existing file
|
||||
HoodieCopyOnWriteTable<HoodieAvroPayload> table =
|
||||
new HoodieCopyOnWriteTable<>(config, metaClient);
|
||||
return table
|
||||
.handleUpdate(commitTime, operation.getFileId(), readDeltaFilesInMemory.iterator());
|
||||
}
|
||||
|
||||
public boolean commitCompaction(String commitTime, HoodieTableMetaClient metaClient,
|
||||
HoodieCompactionMetadata metadata) {
|
||||
log.info("Comitting " + commitTime);
|
||||
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
|
||||
|
||||
try {
|
||||
activeTimeline.saveAsComplete(
|
||||
new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime),
|
||||
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCommitException(
|
||||
"Failed to commit " + metaClient.getBasePath() + " at time " + commitTime, e);
|
||||
}
|
||||
return true;
|
||||
// Compacting is very similar to applying updates to existing file
|
||||
HoodieCopyOnWriteTable<HoodieAvroPayload> table =
|
||||
new HoodieCopyOnWriteTable<>(config, metaClient);
|
||||
Iterator<List<WriteStatus>> result = table
|
||||
.handleUpdate(commitTime, operation.getFileId(), avroReader.iterator());
|
||||
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
||||
return StreamSupport.stream(resultIterable.spliterator(), false)
|
||||
.flatMap(Collection::stream)
|
||||
.map(WriteStatus::getStat)
|
||||
.map(s -> CompactionWriteStat.newBuilder().withHoodieWriteStat(s)
|
||||
.setTotalRecordsToUpdate(avroReader.getTotalRecordsToUpdate())
|
||||
.setTotalLogFiles(avroReader.getTotalLogFiles())
|
||||
.setTotalLogRecords(avroReader.getTotalLogRecords())
|
||||
.onPartition(operation.getPartitionPath()).build())
|
||||
.collect(toList());
|
||||
}
|
||||
|
||||
public boolean commitCompaction(String commitTime, HoodieTableMetaClient metaClient,
|
||||
HoodieCompactionMetadata metadata) {
|
||||
log.info("Committing Compaction " + commitTime);
|
||||
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
|
||||
|
||||
try {
|
||||
activeTimeline.saveAsComplete(
|
||||
new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime),
|
||||
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCompactionException(
|
||||
"Failed to commit " + metaClient.getBasePath() + " at time " + commitTime, e);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
@@ -32,6 +33,7 @@ import com.uber.hoodie.exception.HoodieUpsertException;
|
||||
import com.uber.hoodie.func.LazyInsertIterable;
|
||||
import com.uber.hoodie.io.HoodieUpdateHandle;
|
||||
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -55,6 +57,7 @@ import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import scala.Option;
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -470,4 +473,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
Partitioner partitioner) {
|
||||
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc) {
|
||||
logger.info("Nothing to compact in COW storage format");
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,11 +17,17 @@
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCompactionException;
|
||||
import com.uber.hoodie.io.HoodieAppendHandle;
|
||||
import com.uber.hoodie.io.compact.CompactionFilter;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import java.util.Optional;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -29,6 +35,7 @@ import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Implementation of a more real-time read-optimized Hoodie Table where
|
||||
@@ -59,4 +66,33 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc) {
|
||||
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
|
||||
Optional<HoodieInstant> lastCompaction = getActiveTimeline().getCompactionTimeline()
|
||||
.filterCompletedInstants().lastInstant();
|
||||
String deltaCommitsSinceTs = "0";
|
||||
if (lastCompaction.isPresent()) {
|
||||
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
|
||||
}
|
||||
|
||||
int deltaCommitsSinceLastCompaction = getActiveTimeline().getDeltaCommitTimeline()
|
||||
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
|
||||
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
|
||||
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
|
||||
+ " delta commits was found since last compaction " + deltaCommitsSinceTs
|
||||
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax());
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
logger.info("Compacting merge on read table " + config.getBasePath());
|
||||
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
|
||||
try {
|
||||
return Optional.of(compactor.compact(jsc, config, this, CompactionFilter.allowAll()));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ package com.uber.hoodie.table;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
|
||||
import com.uber.hoodie.avro.model.HoodieSavepointPartitionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
@@ -34,6 +34,7 @@ import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
|
||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||
import java.util.Optional;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.Partitioner;
|
||||
|
||||
@@ -41,9 +42,9 @@ import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Abstract implementation of a HoodieTable
|
||||
@@ -195,8 +196,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
return getActiveTimeline().getCommitTimeline();
|
||||
case MERGE_ON_READ:
|
||||
// We need to include the parquet files written out in delta commits
|
||||
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
||||
return getActiveTimeline().getTimelineOfActions(
|
||||
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION,
|
||||
Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION,
|
||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION));
|
||||
default:
|
||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
||||
@@ -293,4 +295,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run Compaction on the table.
|
||||
* Compaction arranges the data so that it is optimized for data access
|
||||
*/
|
||||
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user