1
0

Introduce RealtimeTableView and Implement HoodieRealtimeTableCompactor (#73)

This commit is contained in:
prazanna
2017-02-06 14:32:32 -08:00
committed by Prasanna Rajaperumal
parent 48fbb0f425
commit 11d2fd3428
30 changed files with 1074 additions and 95 deletions

View File

@@ -64,7 +64,6 @@ import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
@@ -94,8 +93,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private transient final HoodieCommitArchiveLog archiveLog;
private transient Timer.Context writeContext = null;
private final SimpleDateFormat FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");
/**
* @param jsc
* @param clientConfig
@@ -331,7 +328,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
clean();
if (writeContext != null) {
long durationInMs = metrics.getDurationInMs(writeContext.stop());
metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs,
metrics.updateCommitMetrics(
HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs,
metadata);
writeContext = null;
}
@@ -495,7 +493,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Provides a new commit time for a write operation (insert/update)
*/
public String startCommit() {
String commitTime = FORMATTER.format(new Date());
String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
startCommitWithTime(commitTime);
return commitTime;
}

View File

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import java.util.List;
/**
 * A CompactionFilter decides which of the proposed compaction operations should
 * actually run, letting callers prioritize some compactions and drop others.
 *
 * e.g. filter out inefficient compactions, such as merging a very large old
 * parquet file with a tiny avro file.
 */
public interface CompactionFilter {

    List<CompactionOperation> filter(List<CompactionOperation> input);

    /**
     * Pass-through filter: keeps every proposed compaction operation.
     */
    static CompactionFilter allowAll() {
        return operations -> operations;
    }
}

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import java.io.Serializable;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Captures everything needed to run one compaction — a single base data file plus
 * the delta log files to merge into it — so a CompactionFilter can decide whether
 * the operation is effective enough to execute.
 *
 * @see CompactionFilter
 */
public class CompactionOperation implements Serializable {

    private String fileId;
    private String partitionPath;
    private String dataFilePath;
    private String dataFileCommitTime;
    private long dataFileSize;
    private List<String> deltaFilePaths;

    //Only for serialization/de-serialization
    @Deprecated
    public CompactionOperation() {
    }

    public CompactionOperation(HoodieDataFile dataFile, String partitionPath,
        List<HoodieLogFile> value) {
        this.fileId = dataFile.getFileId();
        this.partitionPath = partitionPath;
        this.dataFilePath = dataFile.getPath();
        this.dataFileCommitTime = dataFile.getCommitTime();
        this.dataFileSize = dataFile.getFileStatus().getLen();
        this.deltaFilePaths = value.stream()
            .map(logFile -> logFile.getPath().toString())
            .collect(Collectors.toList());
    }

    /** Commit time of the base data file being compacted. */
    public String getDataFileCommitTime() {
        return dataFileCommitTime;
    }

    /** Size in bytes of the base data file. */
    public long getDataFileSize() {
        return dataFileSize;
    }

    /** Full paths of the delta (log) files to merge into the base file. */
    public List<String> getDeltaFilePaths() {
        return deltaFilePaths;
    }

    /** Full path of the base data file. */
    public String getDataFilePath() {
        return dataFilePath;
    }

    /** File id shared by the base file and its deltas. */
    public String getFileId() {
        return fileId;
    }

    /** Partition the file group lives in. */
    public String getPartitionPath() {
        return partitionPath;
    }
}

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
/**
 * Place holder for the compaction specific meta-data, uses all the details used in a normal HoodieCommitMetadata
 */
// Marker subclass: a distinct type lets compaction results be told apart from
// regular commit metadata, while reusing HoodieCommitMetadata's stats/JSON handling.
public class HoodieCompactionMetadata extends HoodieCommitMetadata {
}

View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Optional;
/**
 * A HoodieCompactor runs compaction on a hoodie table
 */
public interface HoodieCompactor extends Serializable {

    /**
     * Compact the delta files with the data files
     *
     * @throws Exception if the compaction cannot be completed
     */
    HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
        HoodieTableMetaClient metaClient, RealtimeTableView fsView,
        CompactionFilter compactionFilter) throws Exception;

    // Helper methods

    /**
     * Marks a new compaction as in-flight on the active timeline and returns the
     * commit time that was generated for it.
     */
    default String startCompactionCommit(HoodieTableMetaClient metaClient) {
        final String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
        metaClient.getActiveTimeline()
            .createInflight(new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime));
        return commitTime;
    }
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import org.apache.avro.Schema;
import org.apache.commons.collections.IteratorUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
 * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage.
 * Computes all possible compactions, passes it through a CompactionFilter and executes
 * all the compactions and writes a new version of base files and make a normal commit
 *
 * @see HoodieCompactor
 */
public class HoodieRealtimeTableCompactor implements HoodieCompactor {

    // final: the shared static logger is a constant and must never be reassigned
    private static final Logger log = LogManager.getLogger(HoodieRealtimeTableCompactor.class);

    /**
     * Runs a full compaction pass: discovers candidate file groups in every partition,
     * filters them, executes each merge on the cluster, and commits the result.
     *
     * @return the compaction metadata, or null if nothing survived filtering
     */
    @Override
    public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
        HoodieTableMetaClient metaClient, RealtimeTableView fsView,
        CompactionFilter compactionFilter) throws Exception {
        // TODO - rollback any compactions in flight
        String compactionCommit = startCompactionCommit(metaClient);
        log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);

        // For every partition (in parallel), pair the latest data file of each file
        // group with its delta log files; each pair is one candidate compaction.
        List<String> partitionPaths =
            FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath());
        log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
        List<CompactionOperation> operations =
            jsc.parallelize(partitionPaths, partitionPaths.size())
                .flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> {
                    FileSystem fileSystem = FSUtils.getFs();
                    return fsView.groupLatestDataFileWithLogFiles(fileSystem, partitionPath)
                        .entrySet().stream()
                        .map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue()))
                        .collect(Collectors.toList());
                }).collect();
        log.info("Total of " + operations.size() + " compactions are retrieved");

        // Filter the compactions with the passed in filter. This lets us choose most effective compactions only
        operations = compactionFilter.filter(operations);
        if (operations.isEmpty()) {
            log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
            return null;
        }

        log.info("After filtering, Compacting " + operations + " files");
        // Execute each surviving compaction on the cluster and collect the
        // per-partition write stats back to the driver.
        List<Tuple2<String, HoodieWriteStat>> updateStatusMap =
            jsc.parallelize(operations, operations.size()).map(
                (Function<CompactionOperation, Iterator<List<WriteStatus>>>) compactionOperation -> executeCompaction(
                    metaClient, config, compactionOperation, compactionCommit)).flatMap(
                (FlatMapFunction<Iterator<List<WriteStatus>>, WriteStatus>) listIterator -> {
                    List<List<WriteStatus>> collected = IteratorUtils.toList(listIterator);
                    return collected.stream().flatMap(List::stream).collect(Collectors.toList());
                }).mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
                @Override
                public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
                    throws Exception {
                    return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
                }
            }).collect();

        HoodieCompactionMetadata metadata = new HoodieCompactionMetadata();
        for (Tuple2<String, HoodieWriteStat> stat : updateStatusMap) {
            metadata.addWriteStat(stat._1(), stat._2());
        }
        log.info("Compaction finished with result " + metadata);

        //noinspection ConstantConditions
        if (isCompactionSucceeded(metadata)) {
            log.info("Compaction succeeded " + compactionCommit);
            commitCompaction(compactionCommit, metaClient, metadata);
        } else {
            log.info("Compaction failed " + compactionCommit);
        }
        return metadata;
    }

    private boolean isCompactionSucceeded(HoodieCompactionMetadata result) {
        //TODO figure out a success factor for a compaction - currently always succeeds
        return true;
    }

    /**
     * Merges one base data file with its delta files: loads the delta records and
     * hands them to the copy-on-write update path, which writes a new base file
     * version under {@code commitTime}.
     */
    private Iterator<List<WriteStatus>> executeCompaction(HoodieTableMetaClient metaClient,
        HoodieWriteConfig config, CompactionOperation operation, String commitTime)
        throws IOException {
        FileSystem fs = FSUtils.getFs();
        Schema schema =
            HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
        log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
            .getDeltaFilePaths() + " for commit " + commitTime);
        // TODO - FIX THIS
        // 1. Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recovery).
        // Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
        // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
        // 2. naively loads all the delta records in memory to merge it,
        // since we only need a iterator, we could implement a lazy iterator to load from one delta file at a time
        List<HoodieRecord<HoodieAvroPayload>> readDeltaFilesInMemory =
            AvroUtils.loadFromFiles(fs, operation.getDeltaFilePaths(), schema);
        HoodieCopyOnWriteTable<HoodieAvroPayload> table =
            new HoodieCopyOnWriteTable<>(commitTime, config, metaClient);
        return table.handleUpdate(operation.getFileId(), readDeltaFilesInMemory.iterator());
    }

    /**
     * Marks the compaction instant complete on the active timeline, persisting the
     * metadata as JSON.
     *
     * @throws HoodieCommitException if the timeline update fails
     */
    public boolean commitCompaction(String commitTime, HoodieTableMetaClient metaClient,
        HoodieCompactionMetadata metadata) {
        log.info("Committing " + commitTime);
        HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
        try {
            activeTimeline.saveAsComplete(
                new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime),
                Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
        } catch (IOException e) {
            throw new HoodieCommitException(
                "Failed to commit " + metaClient.getBasePath() + " at time " + commitTime, e);
        }
        return true;
    }
}

View File

@@ -30,7 +30,6 @@ import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieInsertException;
import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.func.LazyInsertIterable;
import com.uber.hoodie.io.HoodieUpdateHandle;
@@ -390,7 +389,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
return getUpsertPartitioner(profile);
return null;
}
@Override
@@ -399,7 +398,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
public Iterator<List<WriteStatus>> handleUpdate(String fileLoc, Iterator<HoodieRecord<T>> recordItr) throws Exception {
public Iterator<List<WriteStatus>> handleUpdate(String fileLoc, Iterator<HoodieRecord<T>> recordItr)
throws IOException {
// these are updates
HoodieUpdateHandle upsertHandle =
new HoodieUpdateHandle<>(config, commitTime, metaClient, recordItr, fileLoc);
@@ -462,11 +463,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieUpsertException(msg, t);
}
}
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(Integer partition,
Iterator recordItr,
Partitioner partitioner) {
return handleUpsertPartition(partition, recordItr, partitioner);
}
}

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.common;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
@@ -66,7 +67,7 @@ public class HoodieTestDataGenerator {
private List<KeyPartition> existingKeysList = new ArrayList<>();
private static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747);
private String[] partitionPaths = {"2016/03/15", "2015/03/16", "2015/03/17"};

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io;
import com.uber.hoodie.HoodieReadClient;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieBloomIndex;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.CompactionFilter;
import com.uber.hoodie.io.compact.HoodieCompactionMetadata;
import com.uber.hoodie.io.compact.HoodieCompactor;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
/**
 * Tests HoodieRealtimeTableCompactor against a local Spark context and a
 * temporary merge-on-read table.
 */
public class TestHoodieCompactor {

    private transient JavaSparkContext jsc = null;
    private transient SQLContext sqlContext;
    private String basePath = null;
    private HoodieCompactor compactor;
    private transient HoodieTestDataGenerator dataGen = null;

    @Before
    public void init() throws IOException {
        // Initialize a local spark env
        SparkConf sparkConf =
            new SparkConf().setAppName("TestHoodieCompactor").setMaster("local[4]");
        jsc = new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf));

        // Create a temp folder as the base path
        TemporaryFolder folder = new TemporaryFolder();
        folder.create();
        basePath = folder.getRoot().getAbsolutePath();
        HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
        dataGen = new HoodieTestDataGenerator();
        compactor = new HoodieRealtimeTableCompactor();
    }

    @After
    public void clean() {
        if (basePath != null) {
            // File.delete() only removes empty directories, so the generated table
            // was being leaked; delete the tree recursively instead.
            deleteRecursively(new File(basePath));
        }
        if (jsc != null) {
            jsc.stop();
        }
    }

    // Depth-first, best-effort delete of a file tree (failures are ignored,
    // matching the previous best-effort cleanup behavior).
    private static void deleteRecursively(File file) {
        File[] children = file.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        file.delete();
    }

    private HoodieWriteConfig getConfig() {
        return getConfigBuilder().build();
    }

    private HoodieWriteConfig.Builder getConfigBuilder() {
        return HoodieWriteConfig.newBuilder().withPath(basePath)
            .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
            .withCompactionConfig(
                HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
            .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
            .forTable("test-trip-table").withIndexConfig(
                HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
    }

    @Test(expected = IllegalArgumentException.class)
    public void testCompactionOnCopyOnWriteFail() throws Exception {
        // Compaction is only defined for merge-on-read tables; copy-on-write must be rejected
        HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
        RealtimeTableView fsView = new RealtimeTableView(FSUtils.getFs(), metaClient);
        compactor.compact(jsc, getConfig(), metaClient, fsView, CompactionFilter.allowAll());
    }

    @Test
    public void testCompactionEmpty() throws Exception {
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
        RealtimeTableView fsView = new RealtimeTableView(FSUtils.getFs(), metaClient);
        HoodieWriteConfig config = getConfig();
        HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);

        // Inserts alone produce no delta log files, so there is nothing to compact
        String newCommitTime = writeClient.startCommit();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();

        HoodieCompactionMetadata result =
            compactor.compact(jsc, config, metaClient, fsView, CompactionFilter.allowAll());
        assertTrue("If there is nothing to compact, result will be null", result == null);
    }

    @Test
    public void testLogFileCountsAfterCompaction() throws Exception {
        FileSystem fs = FSUtils.getFs();
        // insert 100 records
        HoodieWriteConfig config = getConfig();
        HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
        String newCommitTime = "100";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();

        // Update all the 100 records
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
        HoodieIndex index = new HoodieBloomIndex<>(config, jsc);
        updatedRecords = index.tagLocation(updatedRecordsRDD, metaClient).collect();

        // Write them to corresponding avro logfiles
        HoodieTestUtils
            .writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
                updatedRecords);

        // Verify that all data file has one log file
        metaClient = new HoodieTableMetaClient(fs, basePath);
        RealtimeTableView fsView = new RealtimeTableView(fs, metaClient);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            Map<HoodieDataFile, List<HoodieLogFile>> groupedLogFiles =
                fsView.groupLatestDataFileWithLogFiles(fs, partitionPath);
            for (List<HoodieLogFile> logFiles : groupedLogFiles.values()) {
                assertEquals("There should be 1 log file written for every data file", 1,
                    logFiles.size());
            }
        }

        // Do a compaction
        metaClient = new HoodieTableMetaClient(fs, basePath);
        fsView = new RealtimeTableView(fs, metaClient);
        HoodieCompactionMetadata result =
            compactor.compact(jsc, config, metaClient, fsView, CompactionFilter.allowAll());

        // Verify that recently written compacted data file has no log file
        metaClient = new HoodieTableMetaClient(fs, basePath);
        fsView = new RealtimeTableView(fs, metaClient);
        HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
        assertTrue("Compaction commit should be > than last insert", timeline
            .compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
                HoodieTimeline.GREATER));
        for (String partitionPath : dataGen.getPartitionPaths()) {
            Map<HoodieDataFile, List<HoodieLogFile>> groupedLogFiles =
                fsView.groupLatestDataFileWithLogFiles(fs, partitionPath);
            for (List<HoodieLogFile> logFiles : groupedLogFiles.values()) {
                assertTrue(
                    "After compaction there should be no log files visible on a Realtime view",
                    logFiles.isEmpty());
            }
            assertTrue(result.getPartitionToWriteStats().containsKey(partitionPath));
        }
    }

    // TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
    // TODO - add more test cases for compactions after a failed commit/compaction
}