1
0

Introduce RealtimeTableView and Implement HoodieRealtimeTableCompactor (#73)

This commit is contained in:
prazanna
2017-02-06 14:32:32 -08:00
committed by Prasanna Rajaperumal
parent 48fbb0f425
commit 11d2fd3428
30 changed files with 1074 additions and 95 deletions

View File

@@ -64,7 +64,6 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
@@ -94,8 +93,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private transient final HoodieCommitArchiveLog archiveLog; private transient final HoodieCommitArchiveLog archiveLog;
private transient Timer.Context writeContext = null; private transient Timer.Context writeContext = null;
private final SimpleDateFormat FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");
/** /**
* @param jsc * @param jsc
* @param clientConfig * @param clientConfig
@@ -331,7 +328,8 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
clean(); clean();
if (writeContext != null) { if (writeContext != null) {
long durationInMs = metrics.getDurationInMs(writeContext.stop()); long durationInMs = metrics.getDurationInMs(writeContext.stop());
metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs, metrics.updateCommitMetrics(
HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs,
metadata); metadata);
writeContext = null; writeContext = null;
} }
@@ -495,7 +493,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Provides a new commit time for a write operation (insert/update) * Provides a new commit time for a write operation (insert/update)
*/ */
public String startCommit() { public String startCommit() {
String commitTime = FORMATTER.format(new Date()); String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
startCommitWithTime(commitTime); startCommitWithTime(commitTime);
return commitTime; return commitTime;
} }

View File

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import java.util.List;
/**
 * A pluggable strategy for choosing which candidate compactions actually run.
 * Implementations can prioritize valuable operations and drop inefficient ones
 * (e.g. merging a very large, old parquet base file with a tiny avro delta file).
 */
public interface CompactionFilter {

  /**
   * Returns the subset of {@code input} operations that should be executed.
   */
  List<CompactionOperation> filter(List<CompactionOperation> input);

  /** A pass-through filter that keeps every candidate compaction. */
  static CompactionFilter allowAll() {
    return operations -> operations;
  }
}

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import java.io.Serializable;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Describes one unit of compaction work: the latest base (data) file for a file id
 * plus the delta (log) file paths that must be merged into it. Carries the details
 * (file size, commit time, partition) a {@link CompactionFilter} needs to decide
 * whether the operation is worth executing.
 *
 * @see CompactionFilter
 */
public class CompactionOperation implements Serializable {
  private String dataFileCommitTime;
  private long dataFileSize;
  private List<String> deltaFilePaths;
  private String dataFilePath;
  private String fileId;
  private String partitionPath;

  /** Required for serialization/de-serialization only — do not call directly. */
  @Deprecated
  public CompactionOperation() {
  }

  public CompactionOperation(HoodieDataFile dataFile, String partitionPath,
      List<HoodieLogFile> logFiles) {
    this.dataFilePath = dataFile.getPath();
    this.fileId = dataFile.getFileId();
    this.partitionPath = partitionPath;
    this.dataFileCommitTime = dataFile.getCommitTime();
    this.dataFileSize = dataFile.getFileStatus().getLen();
    this.deltaFilePaths = logFiles.stream()
        .map(logFile -> logFile.getPath().toString())
        .collect(Collectors.toList());
  }

  public String getDataFileCommitTime() {
    return dataFileCommitTime;
  }

  public long getDataFileSize() {
    return dataFileSize;
  }

  public List<String> getDeltaFilePaths() {
    return deltaFilePaths;
  }

  public String getDataFilePath() {
    return dataFilePath;
  }

  public String getFileId() {
    return fileId;
  }

  public String getPartitionPath() {
    return partitionPath;
  }
}

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
/**
 * Placeholder for compaction-specific metadata; currently reuses everything from a
 * normal HoodieCommitMetadata. Kept as a distinct type so compaction commits can be
 * distinguished (and extended with compaction-only fields) later.
 */
public class HoodieCompactionMetadata extends HoodieCommitMetadata {
}

View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Optional;
/**
 * Contract for running compaction on a hoodie table: merging delta files back into
 * base data files and recording the result as a compaction commit.
 */
public interface HoodieCompactor extends Serializable {

  /**
   * Compacts the delta files with the data files of the table.
   *
   * @param jsc              spark context to run the compaction on
   * @param config           write config (schema, parallelism, etc.)
   * @param metaClient       meta client for the table being compacted
   * @param fsView           realtime view used to pair data files with log files
   * @param compactionFilter filter used to prune the candidate compactions
   * @return metadata describing the compaction; may be null when there is nothing
   *         to compact (see implementations)
   * @throws Exception on any failure during the compaction run
   */
  HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
      HoodieTableMetaClient metaClient, RealtimeTableView fsView,
      CompactionFilter compactionFilter) throws Exception;

  /**
   * Generates a fresh compaction commit time and marks that instant as in-flight on
   * the table's active timeline.
   *
   * @return the new compaction commit time
   */
  default String startCompactionCommit(HoodieTableMetaClient metaClient) {
    String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
    metaClient.getActiveTimeline()
        .createInflight(new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime));
    return commitTime;
  }
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io.compact;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCommitException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import org.apache.avro.Schema;
import org.apache.commons.collections.IteratorUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
 * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage.
 * Computes all possible compactions, passes them through a CompactionFilter, executes
 * the remaining compactions on Spark, writes a new version of the base files, and
 * records the result as a compaction commit on the active timeline.
 *
 * @see HoodieCompactor
 */
public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// NOTE(review): logger is never reassigned — consider making it final.
private static Logger log = LogManager.getLogger(HoodieRealtimeTableCompactor.class);
/**
 * Runs a full compaction pass:
 *  1. opens an in-flight compaction commit (startCompactionCommit),
 *  2. scans every partition and pairs each latest data file with its log files,
 *  3. prunes the candidates with the supplied CompactionFilter,
 *  4. executes each remaining operation as a Spark task, and
 *  5. commits the collected write stats on the active timeline.
 *
 * @return the compaction metadata, or null when filtering leaves nothing to
 *         compact — callers must handle the null case
 */
@Override
public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
HoodieTableMetaClient metaClient, RealtimeTableView fsView,
CompactionFilter compactionFilter) throws Exception {
// TODO - rollback any compactions in flight
String compactionCommit = startCompactionCommit(metaClient);
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
List<String> partitionPaths =
FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath());
log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
// One Spark task per partition: build a CompactionOperation for every
// (latest data file, log files) group the realtime view reports.
List<CompactionOperation> operations =
jsc.parallelize(partitionPaths, partitionPaths.size())
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> {
FileSystem fileSystem = FSUtils.getFs();
return fsView.groupLatestDataFileWithLogFiles(fileSystem, partitionPath)
.entrySet().stream()
.map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue()))
.collect(Collectors.toList());
}).collect();
log.info("Total of " + operations.size() + " compactions are retrieved");
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only
operations = compactionFilter.filter(operations);
if(operations.isEmpty()) {
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
return null;
}
log.info("After filtering, Compacting " + operations + " files");
// One Spark task per operation: run the compaction, flatten the per-operation
// WriteStatus lists, then key every stat by its partition path.
List<Tuple2<String, HoodieWriteStat>> updateStatusMap =
jsc.parallelize(operations, operations.size()).map(
(Function<CompactionOperation, Iterator<List<WriteStatus>>>) compactionOperation -> executeCompaction(
metaClient, config, compactionOperation, compactionCommit)).flatMap(
(FlatMapFunction<Iterator<List<WriteStatus>>, WriteStatus>) listIterator -> {
List<List<WriteStatus>> collected = IteratorUtils.toList(listIterator);
return collected.stream().flatMap(List::stream).collect(Collectors.toList());
}).mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
@Override
public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
throws Exception {
return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
}
}).collect();
// Accumulate the distributed results into a single commit metadata object.
HoodieCompactionMetadata metadata = new HoodieCompactionMetadata();
for (Tuple2<String, HoodieWriteStat> stat : updateStatusMap) {
metadata.addWriteStat(stat._1(), stat._2());
}
log.info("Compaction finished with result " + metadata);
//noinspection ConstantConditions
if (isCompactionSucceeded(metadata)) {
log.info("Compaction succeeded " + compactionCommit);
commitCompaction(compactionCommit, metaClient, metadata);
} else {
log.info("Compaction failed " + compactionCommit);
}
return metadata;
}
/** Success check for the overall compaction; currently always true. */
private boolean isCompactionSucceeded(HoodieCompactionMetadata result) {
//TODO figure out a success factor for a compaction
return true;
}
/**
 * Executes a single compaction operation: loads ALL delta records for the operation
 * into memory and merges them into the base file via the copy-on-write update path.
 * Runs on a Spark executor.
 */
private Iterator<List<WriteStatus>> executeCompaction(HoodieTableMetaClient metaClient,
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
throws IOException {
FileSystem fs = FSUtils.getFs();
Schema schema =
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
.getDeltaFilePaths() + " for commit " + commitTime);
// TODO - FIX THIS
// 1. Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover).
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
// 2. naively loads all the delta records in memory to merge it,
// since we only need a iterator, we could implement a lazy iterator to load from one delta file at a time
List<HoodieRecord<HoodieAvroPayload>> readDeltaFilesInMemory =
AvroUtils.loadFromFiles(fs, operation.getDeltaFilePaths(), schema);
HoodieCopyOnWriteTable<HoodieAvroPayload> table =
new HoodieCopyOnWriteTable<>(commitTime, config, metaClient);
return table.handleUpdate(operation.getFileId(), readDeltaFilesInMemory.iterator());
}
/**
 * Marks the in-flight compaction instant complete on the active timeline, persisting
 * the metadata as UTF-8 JSON.
 *
 * @throws HoodieCommitException if writing to the timeline fails
 */
// NOTE(review): "Comitting" in the log message below is a typo for "Committing".
public boolean commitCompaction(String commitTime, HoodieTableMetaClient metaClient,
HoodieCompactionMetadata metadata) {
log.info("Comitting " + commitTime);
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
try {
activeTimeline.saveAsComplete(
new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException e) {
throw new HoodieCommitException(
"Failed to commit " + metaClient.getBasePath() + " at time " + commitTime, e);
}
return true;
}
}

View File

@@ -30,7 +30,6 @@ import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation; import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload; import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieInsertException;
import com.uber.hoodie.exception.HoodieUpsertException; import com.uber.hoodie.exception.HoodieUpsertException;
import com.uber.hoodie.func.LazyInsertIterable; import com.uber.hoodie.func.LazyInsertIterable;
import com.uber.hoodie.io.HoodieUpdateHandle; import com.uber.hoodie.io.HoodieUpdateHandle;
@@ -390,7 +389,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override @Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) { public Partitioner getInsertPartitioner(WorkloadProfile profile) {
return getUpsertPartitioner(profile); return null;
} }
@Override @Override
@@ -399,7 +398,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
} }
public Iterator<List<WriteStatus>> handleUpdate(String fileLoc, Iterator<HoodieRecord<T>> recordItr) throws Exception {
public Iterator<List<WriteStatus>> handleUpdate(String fileLoc, Iterator<HoodieRecord<T>> recordItr)
throws IOException {
// these are updates // these are updates
HoodieUpdateHandle upsertHandle = HoodieUpdateHandle upsertHandle =
new HoodieUpdateHandle<>(config, commitTime, metaClient, recordItr, fileLoc); new HoodieUpdateHandle<>(config, commitTime, metaClient, recordItr, fileLoc);
@@ -462,11 +463,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieUpsertException(msg, t); throw new HoodieUpsertException(msg, t);
} }
} }
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(Integer partition,
Iterator recordItr,
Partitioner partitioner) {
return handleUpsertPartition(partition, recordItr, partitioner);
}
} }

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.common;
import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
@@ -66,7 +67,7 @@ public class HoodieTestDataGenerator {
private List<KeyPartition> existingKeysList = new ArrayList<>(); private List<KeyPartition> existingKeysList = new ArrayList<>();
private static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747); private static Random rand = new Random(46474747);
private String[] partitionPaths = {"2016/03/15", "2015/03/16", "2015/03/17"}; private String[] partitionPaths = {"2016/03/15", "2015/03/16", "2015/03/17"};

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io;
import com.uber.hoodie.HoodieReadClient;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.RealtimeTableView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieBloomIndex;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.CompactionFilter;
import com.uber.hoodie.io.compact.HoodieCompactionMetadata;
import com.uber.hoodie.io.compact.HoodieCompactor;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link HoodieRealtimeTableCompactor}: verifies that compaction is rejected
 * on COPY_ON_WRITE tables, that an empty compaction returns null, and that log files
 * disappear from the realtime view after a successful compaction.
 */
public class TestHoodieCompactor {

  private transient JavaSparkContext jsc = null;
  private transient SQLContext sqlContext;
  private String basePath = null;
  private HoodieCompactor compactor;
  private transient HoodieTestDataGenerator dataGen = null;

  @Before
  public void init() throws IOException {
    // Initialize a local spark env
    SparkConf sparkConf =
        new SparkConf().setAppName("TestHoodieCompactor").setMaster("local[4]");
    jsc = new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf));

    // Create a temp folder as the base path.
    // NOTE(review): TemporaryFolder is used without a @Rule, so JUnit will not
    // auto-delete it; clean() below removes the directory recursively instead.
    TemporaryFolder folder = new TemporaryFolder();
    folder.create();
    basePath = folder.getRoot().getAbsolutePath();
    HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);

    dataGen = new HoodieTestDataGenerator();
    compactor = new HoodieRealtimeTableCompactor();
  }

  @After
  public void clean() {
    if (basePath != null) {
      // File.delete() silently fails on non-empty directories, which would leak
      // the temp table data — delete the tree recursively instead.
      deleteRecursively(new File(basePath));
    }
    if (jsc != null) {
      jsc.stop();
    }
  }

  /** Best-effort recursive delete of {@code file} and everything beneath it. */
  private static void deleteRecursively(File file) {
    File[] children = file.listFiles();
    if (children != null) {
      for (File child : children) {
        deleteRecursively(child);
      }
    }
    file.delete();
  }

  private HoodieWriteConfig getConfig() {
    return getConfigBuilder().build();
  }

  /** Shared write-config builder: small file/storage limits keep the test fast. */
  private HoodieWriteConfig.Builder getConfigBuilder() {
    return HoodieWriteConfig.newBuilder().withPath(basePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
        .withCompactionConfig(
            HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
        .forTable("test-trip-table").withIndexConfig(
            HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
  }

  @Test(expected = IllegalArgumentException.class)
  public void testCompactionOnCopyOnWriteFail() throws Exception {
    // Compacting a COPY_ON_WRITE table is invalid and must be rejected.
    HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
    RealtimeTableView fsView = new RealtimeTableView(FSUtils.getFs(), metaClient);
    compactor.compact(jsc, getConfig(), metaClient, fsView, CompactionFilter.allowAll());
  }

  @Test
  public void testCompactionEmpty() throws Exception {
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
    RealtimeTableView fsView = new RealtimeTableView(FSUtils.getFs(), metaClient);
    HoodieWriteConfig config = getConfig();
    HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);

    // Insert only — no log files are written, so there is nothing to compact.
    String newCommitTime = writeClient.startCommit();
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
    JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
    writeClient.insert(recordsRDD, newCommitTime).collect();

    HoodieCompactionMetadata result =
        compactor.compact(jsc, getConfig(), metaClient, fsView, CompactionFilter.allowAll());
    assertTrue("If there is nothing to compact, result will be null", result == null);
  }

  @Test
  public void testLogFileCountsAfterCompaction() throws Exception {
    FileSystem fs = FSUtils.getFs();

    // insert 100 records
    HoodieWriteConfig config = getConfig();
    HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
    String newCommitTime = "100";
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
    JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
    writeClient.insert(recordsRDD, newCommitTime).collect();

    // Update all the 100 records
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
    newCommitTime = "101";
    List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
    JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
    HoodieIndex index = new HoodieBloomIndex<>(config, jsc);
    updatedRecords = index.tagLocation(updatedRecordsRDD, metaClient).collect();

    // Write them to corresponding avro logfiles
    HoodieTestUtils
        .writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
            updatedRecords);

    // Verify that every data file now has exactly one log file
    metaClient = new HoodieTableMetaClient(fs, basePath);
    RealtimeTableView fsView = new RealtimeTableView(fs, metaClient);
    for (String partitionPath : dataGen.getPartitionPaths()) {
      Map<HoodieDataFile, List<HoodieLogFile>> groupedLogFiles =
          fsView.groupLatestDataFileWithLogFiles(fs, partitionPath);
      for (List<HoodieLogFile> logFiles : groupedLogFiles.values()) {
        assertEquals("There should be 1 log file written for every data file", 1,
            logFiles.size());
      }
    }

    // Do a compaction
    metaClient = new HoodieTableMetaClient(fs, basePath);
    fsView = new RealtimeTableView(fs, metaClient);
    HoodieCompactionMetadata result =
        compactor.compact(jsc, getConfig(), metaClient, fsView, CompactionFilter.allowAll());

    // Verify that the recently written compacted data files have no log files
    metaClient = new HoodieTableMetaClient(fs, basePath);
    fsView = new RealtimeTableView(fs, metaClient);
    HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
    assertTrue("Compaction commit should be > than last insert", timeline
        .compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
            HoodieTimeline.GREATER));
    for (String partitionPath : dataGen.getPartitionPaths()) {
      Map<HoodieDataFile, List<HoodieLogFile>> groupedLogFiles =
          fsView.groupLatestDataFileWithLogFiles(fs, partitionPath);
      for (List<HoodieLogFile> logFiles : groupedLogFiles.values()) {
        assertTrue(
            "After compaction there should be no log files visible on a Realtime view",
            logFiles.isEmpty());
      }
      assertTrue(result.getPartitionToWriteStats().containsKey(partitionPath));
    }
  }

  // TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
  // TODO - add more test cases for compactions after a failed commit/compaction
}

View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import java.io.IOException;
/**
 * A HoodieRecordPayload backed directly by an existing Avro GenericRecord.
 * Useful for creating a HoodieRecord over records already stored in a hoodie
 * dataset (e.g. during compactions).
 */
public class HoodieAvroPayload implements HoodieRecordPayload<HoodieAvroPayload> {

  private final GenericRecord record;

  public HoodieAvroPayload(GenericRecord record) {
    this.record = record;
  }

  @Override
  public HoodieAvroPayload preCombine(HoodieAvroPayload another) {
    // No merge semantics for raw avro payloads: this instance always wins.
    return this;
  }

  @Override
  public IndexedRecord combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema)
      throws IOException {
    // Updates simply replace the current value with this payload's record,
    // rewritten against the given schema.
    return HoodieAvroUtils.rewriteRecord(record, schema);
  }

  @Override
  public IndexedRecord getInsertValue(Schema schema) throws IOException {
    return HoodieAvroUtils.rewriteRecord(record, schema);
  }
}

View File

@@ -17,7 +17,7 @@
package com.uber.hoodie.common.model; package com.uber.hoodie.common.model;
public enum HoodieFileFormat { public enum HoodieFileFormat {
PARQUET(".parquet"); PARQUET(".parquet"), AVRO(".avro");
private final String extension; private final String extension;

View File

@@ -31,5 +31,5 @@ package com.uber.hoodie.common.model;
* SIMPLE_LSM - A simple 2 level LSM tree. * SIMPLE_LSM - A simple 2 level LSM tree.
*/ */
public enum HoodieTableType { public enum HoodieTableType {
COPY_ON_WRITE COPY_ON_WRITE, MERGE_ON_READ
} }

View File

@@ -47,8 +47,12 @@ public class HoodieTableConfig implements Serializable {
public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type"; public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type";
public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME =
"hoodie.table.ro.file.format"; "hoodie.table.ro.file.format";
public static final String HOODIE_RT_FILE_FORMAT_PROP_NAME =
"hoodie.table.rt.file.format";
public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE; public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE;
public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET; public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET;
public static final HoodieFileFormat DEFAULT_RT_FILE_FORMAT = HoodieFileFormat.AVRO;
private Properties props; private Properties props;
public HoodieTableConfig(FileSystem fs, String metaPath) { public HoodieTableConfig(FileSystem fs, String metaPath) {
@@ -108,7 +112,7 @@ public class HoodieTableConfig implements Serializable {
* @return * @return
*/ */
public HoodieTableType getTableType() { public HoodieTableType getTableType() {
if (props.contains(HOODIE_TABLE_TYPE_PROP_NAME)) { if (props.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) {
return HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME)); return HoodieTableType.valueOf(props.getProperty(HOODIE_TABLE_TYPE_PROP_NAME));
} }
return DEFAULT_TABLE_TYPE; return DEFAULT_TABLE_TYPE;
@@ -129,9 +133,22 @@ public class HoodieTableConfig implements Serializable {
* @return HoodieFileFormat for the Read Optimized Storage format * @return HoodieFileFormat for the Read Optimized Storage format
*/ */
public HoodieFileFormat getROFileFormat() { public HoodieFileFormat getROFileFormat() {
if (props.contains(HOODIE_RO_FILE_FORMAT_PROP_NAME)) { if (props.containsKey(HOODIE_RO_FILE_FORMAT_PROP_NAME)) {
return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME)); return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RO_FILE_FORMAT_PROP_NAME));
} }
return DEFAULT_RO_FILE_FORMAT; return DEFAULT_RO_FILE_FORMAT;
} }
/**
* Get the Real Time Storage Format
*
* @return HoodieFileFormat for the Real Time Storage format
*/
public HoodieFileFormat getRTFileFormat() {
if (props.containsKey(HOODIE_RT_FILE_FORMAT_PROP_NAME)) {
return HoodieFileFormat.valueOf(props.getProperty(HOODIE_RT_FILE_FORMAT_PROP_NAME));
}
return DEFAULT_RT_FILE_FORMAT;
}
} }

View File

@@ -137,6 +137,14 @@ public class HoodieTableMetaClient implements Serializable {
return tableConfig; return tableConfig;
} }
/**
* Get the FS implementation for this table
* @return
*/
public FileSystem getFs() {
return fs;
}
/** /**
* Get the active instants as a timeline * Get the active instants as a timeline
* *

View File

@@ -41,15 +41,17 @@ public interface HoodieTimeline extends Serializable {
String COMMIT_ACTION = "commit"; String COMMIT_ACTION = "commit";
String CLEAN_ACTION = "clean"; String CLEAN_ACTION = "clean";
String SAVEPOINT_ACTION = "savepoint"; String SAVEPOINT_ACTION = "savepoint";
String COMPACTION_ACTION = "compaction";
String INFLIGHT_EXTENSION = ".inflight"; String INFLIGHT_EXTENSION = ".inflight";
String COMMIT_EXTENSION = "." + COMMIT_ACTION; String COMMIT_EXTENSION = "." + COMMIT_ACTION;
String CLEAN_EXTENSION = "." + CLEAN_ACTION; String CLEAN_EXTENSION = "." + CLEAN_ACTION;
String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION; String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION;
String COMPACTION_EXTENSION = "." + COMPACTION_ACTION;
//this is to preserve backwards compatibility on commit in-flight filenames //this is to preserve backwards compatibility on commit in-flight filenames
String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION; String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION;
String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION; String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION;
String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION; String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION;
String INFLIGHT_COMPACTION_EXTENSION = "." + COMPACTION_ACTION + INFLIGHT_EXTENSION;
/** /**
* Filter this timeline to just include the in-flights * Filter this timeline to just include the in-flights
@@ -193,6 +195,14 @@ public interface HoodieTimeline extends Serializable {
return commitTime + HoodieTimeline.SAVEPOINT_EXTENSION; return commitTime + HoodieTimeline.SAVEPOINT_EXTENSION;
} }
static String makeInflightCompactionFileName(String commitTime) {
return commitTime + HoodieTimeline.INFLIGHT_COMPACTION_EXTENSION;
}
static String makeCompactionFileName(String commitTime) {
return commitTime + HoodieTimeline.COMPACTION_EXTENSION;
}
static String getCommitFromCommitFile(String commitFileName) { static String getCommitFromCommitFile(String commitFileName) {
return commitFileName.split("\\.")[0]; return commitFileName.split("\\.")[0];
} }

View File

@@ -117,6 +117,8 @@ public class HoodieLogAppendConfig {
private Integer fileVersion; private Integer fileVersion;
// Partition path for the log file // Partition path for the log file
private Path partitionPath; private Path partitionPath;
// The base commit time for which the log files are accumulated
private String baseCommitTime;
public Builder withBufferSize(int bufferSize) { public Builder withBufferSize(int bufferSize) {
this.bufferSize = bufferSize; this.bufferSize = bufferSize;
@@ -173,6 +175,11 @@ public class HoodieLogAppendConfig {
return this; return this;
} }
public Builder withBaseCommitTime(String commitTime) {
this.baseCommitTime = commitTime;
return this;
}
public HoodieLogAppendConfig build() throws IOException { public HoodieLogAppendConfig build() throws IOException {
log.info("Building HoodieLogAppendConfig"); log.info("Building HoodieLogAppendConfig");
if (schema == null) { if (schema == null) {
@@ -185,6 +192,9 @@ public class HoodieLogAppendConfig {
if (fileId == null) { if (fileId == null) {
throw new IllegalArgumentException("FileID is not specified"); throw new IllegalArgumentException("FileID is not specified");
} }
if (baseCommitTime == null) {
throw new IllegalArgumentException("BaseCommitTime is not specified");
}
if (logFileExtension == null) { if (logFileExtension == null) {
throw new IllegalArgumentException("File extension is not specified"); throw new IllegalArgumentException("File extension is not specified");
} }
@@ -194,14 +204,14 @@ public class HoodieLogAppendConfig {
if (fileVersion == null) { if (fileVersion == null) {
log.info("Computing the next log version for " + fileId + " in " + partitionPath); log.info("Computing the next log version for " + fileId + " in " + partitionPath);
fileVersion = fileVersion =
FSUtils.getCurrentLogVersion(fs, partitionPath, fileId, logFileExtension); FSUtils.getCurrentLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
log.info( log.info(
"Computed the next log version for " + fileId + " in " + partitionPath + " as " "Computed the next log version for " + fileId + " in " + partitionPath + " as "
+ fileVersion); + fileVersion);
} }
Path logPath = new Path(partitionPath, Path logPath = new Path(partitionPath,
FSUtils.makeLogFileName(fileId, logFileExtension, fileVersion)); FSUtils.makeLogFileName(fileId, logFileExtension, baseCommitTime, fileVersion));
log.info("LogConfig created on path " + logPath); log.info("LogConfig created on path " + logPath);
HoodieLogFile logFile = new HoodieLogFile(logPath); HoodieLogFile logFile = new HoodieLogFile(logPath);

View File

@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.util.Comparator;
import java.util.Optional; import java.util.Optional;
/** /**
@@ -51,6 +51,10 @@ public class HoodieLogFile {
return FSUtils.getFileIdFromLogPath(path); return FSUtils.getFileIdFromLogPath(path);
} }
public String getBaseCommitTime() {
return FSUtils.getBaseCommitTimeFromLogPath(path);
}
public int getLogVersion() { public int getLogVersion() {
return FSUtils.getFileVersionFromLog(path); return FSUtils.getFileVersionFromLog(path);
} }
@@ -74,10 +78,12 @@ public class HoodieLogFile {
public HoodieLogFile rollOver(FileSystem fs) throws IOException { public HoodieLogFile rollOver(FileSystem fs) throws IOException {
String fileId = getFileId(); String fileId = getFileId();
int newVersion = String baseCommitTime = getBaseCommitTime();
FSUtils.computeNextLogVersion(fs, path.getParent(), fileId, DELTA_EXTENSION); int newVersion = FSUtils
.computeNextLogVersion(fs, path.getParent(), fileId,
DELTA_EXTENSION, baseCommitTime);
return new HoodieLogFile(new Path(path.getParent(), return new HoodieLogFile(new Path(path.getParent(),
FSUtils.makeLogFileName(fileId, DELTA_EXTENSION, newVersion))); FSUtils.makeLogFileName(fileId, DELTA_EXTENSION, baseCommitTime, newVersion)));
} }
public boolean shouldRollOver(HoodieLogAppender currentWriter, HoodieLogAppendConfig config) public boolean shouldRollOver(HoodieLogAppender currentWriter, HoodieLogAppendConfig config)
@@ -85,6 +91,14 @@ public class HoodieLogFile {
return currentWriter.getCurrentSize() > config.getSizeThreshold(); return currentWriter.getCurrentSize() > config.getSizeThreshold();
} }
public static Comparator<HoodieLogFile> getLogVersionComparator() {
return (o1, o2) -> {
// reverse the order
return new Integer(o2.getLogVersion()).compareTo(o1.getLogVersion());
};
}
@Override @Override
public String toString() { public String toString() {
return "HoodieLogFile{" + path + '}'; return "HoodieLogFile{" + path + '}';

View File

@@ -43,10 +43,10 @@ import java.util.stream.Stream;
public class CompositeAvroLogReader { public class CompositeAvroLogReader {
private final Map<Integer, AvroLogReader> readers; private final Map<Integer, AvroLogReader> readers;
public CompositeAvroLogReader(Path partitionPath, String fileId, FileSystem fs, public CompositeAvroLogReader(Path partitionPath, String fileId, String baseCommitTime, FileSystem fs,
Schema readerSchema, String logFileExtension) throws IOException { Schema readerSchema, String logFileExtension) throws IOException {
Stream<HoodieLogFile> allLogFiles = Stream<HoodieLogFile> allLogFiles =
FSUtils.getAllLogFiles(fs, partitionPath, fileId, logFileExtension); FSUtils.getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
this.readers = allLogFiles.map(hoodieLogFile -> { this.readers = allLogFiles.map(hoodieLogFile -> {
try { try {
return new AvroLogReader(hoodieLogFile, fs, readerSchema); return new AvroLogReader(hoodieLogFile, fs, readerSchema);

View File

@@ -31,9 +31,11 @@ import org.apache.log4j.Logger;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
import java.util.Optional; import java.util.Optional;
import java.util.Set;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@@ -50,6 +52,8 @@ import java.util.stream.Stream;
* This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized. * This class can be serialized and de-serialized and on de-serialization the FileSystem is re-initialized.
*/ */
public class HoodieActiveTimeline extends HoodieDefaultTimeline { public class HoodieActiveTimeline extends HoodieDefaultTimeline {
public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");
private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class); private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class);
private String metaPath; private String metaPath;
private transient FileSystem fs; private transient FileSystem fs;
@@ -81,8 +85,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
public HoodieActiveTimeline(FileSystem fs, String metaPath) { public HoodieActiveTimeline(FileSystem fs, String metaPath) {
this(fs, metaPath, this(fs, metaPath,
new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, COMPACTION_EXTENSION,
INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION}); INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, COMPACTION_EXTENSION});
} }
/** /**
@@ -113,6 +117,27 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
(Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails); (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails);
} }
/**
* Get only the commits (inflight and completed) in the compaction timeline
*
* @return
*/
public HoodieTimeline getCompactionTimeline() {
return new HoodieDefaultTimeline(filterInstantsByAction(COMPACTION_ACTION),
(Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails);
}
/**
* Get a timeline of a specific set of actions. useful to create a merged timeline of multiple actions
*
* @param actions actions allowed in the timeline
* @return
*/
public HoodieTimeline getTimelineOfActions(Set<String> actions) {
return new HoodieDefaultTimeline(instants.stream().filter(s -> actions.contains(s.getAction())),
(Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails);
}
/** /**
* Get only the cleaner action (inflight and completed) in the active timeline * Get only the cleaner action (inflight and completed) in the active timeline
* *

View File

@@ -93,6 +93,10 @@ public class HoodieInstant implements Serializable {
return isInflight ? return isInflight ?
HoodieTimeline.makeInflightSavePointFileName(timestamp) : HoodieTimeline.makeInflightSavePointFileName(timestamp) :
HoodieTimeline.makeSavePointFileName(timestamp); HoodieTimeline.makeSavePointFileName(timestamp);
} else if (HoodieTimeline.COMPACTION_ACTION.equals(action)) {
return isInflight ?
HoodieTimeline.makeInflightCompactionFileName(timestamp) :
HoodieTimeline.makeCompactionFileName(timestamp);
} }
throw new IllegalArgumentException("Cannot get file name for unknown action " + action); throw new IllegalArgumentException("Cannot get file name for unknown action " + action);
} }

View File

@@ -21,12 +21,15 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView; import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@@ -47,22 +50,38 @@ import java.util.stream.Stream;
* @see ReadOptimizedTableView * @see ReadOptimizedTableView
* @since 0.3.0 * @since 0.3.0
*/ */
public abstract class AbstractTableFileSystemView implements TableFileSystemView { public abstract class AbstractTableFileSystemView implements TableFileSystemView, Serializable {
protected final HoodieTableMetaClient metaClient; protected HoodieTableMetaClient metaClient;
protected final transient FileSystem fs; protected transient FileSystem fs;
protected final HoodieTimeline activeCommitTimeline; // This is the commits that will be visible for all views extending this view
protected HoodieTimeline visibleActiveCommitTimeline;
public AbstractTableFileSystemView(FileSystem fs, HoodieTableMetaClient metaClient) { public AbstractTableFileSystemView(FileSystem fs, HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveCommitTimeline) {
this.metaClient = metaClient; this.metaClient = metaClient;
this.fs = fs; this.fs = fs;
// Get the active timeline and filter only completed commits this.visibleActiveCommitTimeline = visibleActiveCommitTimeline;
this.activeCommitTimeline = }
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
/**
* This method is only used when this object is deserialized in a spark executor.
*
* @deprecated
*/
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}
private void writeObject(java.io.ObjectOutputStream out)
throws IOException {
out.defaultWriteObject();
} }
public Stream<HoodieDataFile> getLatestDataFilesForFileId(final String partitionPath, public Stream<HoodieDataFile> getLatestDataFilesForFileId(final String partitionPath,
String fileId) { String fileId) {
Optional<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant(); Optional<HoodieInstant> lastInstant = visibleActiveCommitTimeline.lastInstant();
if (lastInstant.isPresent()) { if (lastInstant.isPresent()) {
return getLatestVersionInPartition(partitionPath, lastInstant.get().getTimestamp()) return getLatestVersionInPartition(partitionPath, lastInstant.get().getTimestamp())
.filter(hoodieDataFile -> hoodieDataFile.getFileId().equals(fileId)); .filter(hoodieDataFile -> hoodieDataFile.getFileId().equals(fileId));
@@ -73,22 +92,17 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
@Override @Override
public Stream<HoodieDataFile> getLatestVersionInPartition(String partitionPathStr, public Stream<HoodieDataFile> getLatestVersionInPartition(String partitionPathStr,
String maxCommitTime) { String maxCommitTime) {
try {
return getLatestVersionsBeforeOrOn(listDataFilesInPartition(partitionPathStr), return getLatestVersionsBeforeOrOn(listDataFilesInPartition(partitionPathStr),
maxCommitTime); maxCommitTime);
} catch (IOException e) {
throw new HoodieIOException(
"Could not get latest versions in Partition " + partitionPathStr, e);
}
} }
@Override @Override
public Stream<List<HoodieDataFile>> getEveryVersionInPartition(String partitionPath) { public Stream<List<HoodieDataFile>> getEveryVersionInPartition(String partitionPath) {
try { try {
if (activeCommitTimeline.lastInstant().isPresent()) { if (visibleActiveCommitTimeline.lastInstant().isPresent()) {
return getFilesByFileId(listDataFilesInPartition(partitionPath), return getFilesByFileId(listDataFilesInPartition(partitionPath),
activeCommitTimeline.lastInstant().get().getTimestamp()); visibleActiveCommitTimeline.lastInstant().get().getTimestamp());
} }
return Stream.empty(); return Stream.empty();
} catch (IOException e) { } catch (IOException e) {
@@ -97,18 +111,26 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
} }
} }
protected abstract FileStatus[] listDataFilesInPartition(String partitionPathStr) protected FileStatus[] listDataFilesInPartition(String partitionPathStr) {
throws IOException; Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr);
try {
return fs.listStatus(partitionPath, path -> path.getName()
.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()));
} catch (IOException e) {
throw new HoodieIOException(
"Failed to list data files in partition " + partitionPathStr, e);
}
}
@Override @Override
public Stream<HoodieDataFile> getLatestVersionInRange(FileStatus[] fileStatuses, public Stream<HoodieDataFile> getLatestVersionInRange(FileStatus[] fileStatuses,
List<String> commitsToReturn) { List<String> commitsToReturn) {
if (activeCommitTimeline.empty() || commitsToReturn.isEmpty()) { if (visibleActiveCommitTimeline.empty() || commitsToReturn.isEmpty()) {
return Stream.empty(); return Stream.empty();
} }
try { try {
return getFilesByFileId(fileStatuses, return getFilesByFileId(fileStatuses,
activeCommitTimeline.lastInstant().get().getTimestamp()) visibleActiveCommitTimeline.lastInstant().get().getTimestamp())
.map((Function<List<HoodieDataFile>, Optional<HoodieDataFile>>) fss -> { .map((Function<List<HoodieDataFile>, Optional<HoodieDataFile>>) fss -> {
for (HoodieDataFile fs : fss) { for (HoodieDataFile fs : fss) {
if (commitsToReturn.contains(fs.getCommitTime())) { if (commitsToReturn.contains(fs.getCommitTime())) {
@@ -127,14 +149,14 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
public Stream<HoodieDataFile> getLatestVersionsBeforeOrOn(FileStatus[] fileStatuses, public Stream<HoodieDataFile> getLatestVersionsBeforeOrOn(FileStatus[] fileStatuses,
String maxCommitToReturn) { String maxCommitToReturn) {
try { try {
if (activeCommitTimeline.empty()) { if (visibleActiveCommitTimeline.empty()) {
return Stream.empty(); return Stream.empty();
} }
return getFilesByFileId(fileStatuses, return getFilesByFileId(fileStatuses,
activeCommitTimeline.lastInstant().get().getTimestamp()) visibleActiveCommitTimeline.lastInstant().get().getTimestamp())
.map((Function<List<HoodieDataFile>, Optional<HoodieDataFile>>) fss -> { .map((Function<List<HoodieDataFile>, Optional<HoodieDataFile>>) fss -> {
for (HoodieDataFile fs1 : fss) { for (HoodieDataFile fs1 : fss) {
if (activeCommitTimeline if (visibleActiveCommitTimeline
.compareTimestamps(fs1.getCommitTime(), maxCommitToReturn, .compareTimestamps(fs1.getCommitTime(), maxCommitToReturn,
HoodieTimeline.LESSER_OR_EQUAL)) { HoodieTimeline.LESSER_OR_EQUAL)) {
return Optional.of(fs1); return Optional.of(fs1);
@@ -150,11 +172,11 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
@Override @Override
public Stream<HoodieDataFile> getLatestVersions(FileStatus[] fileStatuses) { public Stream<HoodieDataFile> getLatestVersions(FileStatus[] fileStatuses) {
try { try {
if (activeCommitTimeline.empty()) { if (visibleActiveCommitTimeline.empty()) {
return Stream.empty(); return Stream.empty();
} }
return getFilesByFileId(fileStatuses, return getFilesByFileId(fileStatuses,
activeCommitTimeline.lastInstant().get().getTimestamp()) visibleActiveCommitTimeline.lastInstant().get().getTimestamp())
.map(statuses -> statuses.get(0)); .map(statuses -> statuses.get(0));
} catch (IOException e) { } catch (IOException e) {
throw new HoodieIOException("Could not filter files for latest version ", e); throw new HoodieIOException("Could not filter files for latest version ", e);
@@ -178,8 +200,9 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
String maxCommitTime) throws IOException { String maxCommitTime) throws IOException {
return Arrays.stream(files).flatMap(fileStatus -> { return Arrays.stream(files).flatMap(fileStatus -> {
HoodieDataFile dataFile = new HoodieDataFile(fileStatus); HoodieDataFile dataFile = new HoodieDataFile(fileStatus);
if (activeCommitTimeline.containsOrBeforeTimelineStarts(dataFile.getCommitTime()) if (visibleActiveCommitTimeline.containsOrBeforeTimelineStarts(dataFile.getCommitTime())
&& activeCommitTimeline.compareTimestamps(dataFile.getCommitTime(), maxCommitTime, && visibleActiveCommitTimeline
.compareTimestamps(dataFile.getCommitTime(), maxCommitTime,
HoodieTimeline.LESSER_OR_EQUAL)) { HoodieTimeline.LESSER_OR_EQUAL)) {
return Stream.of(Pair.of(dataFile.getFileId(), dataFile)); return Stream.of(Pair.of(dataFile.getFileId(), dataFile));
} }

View File

@@ -29,19 +29,9 @@ import java.io.IOException;
*/ */
public class ReadOptimizedTableView extends AbstractTableFileSystemView { public class ReadOptimizedTableView extends AbstractTableFileSystemView {
public ReadOptimizedTableView(FileSystem fs, HoodieTableMetaClient metaClient) { public ReadOptimizedTableView(FileSystem fs, HoodieTableMetaClient metaClient) {
super(fs, metaClient); // Get the active timeline and filter only completed commits
super(fs, metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants());
} }
protected FileStatus[] listDataFilesInPartition(String partitionPathStr) {
Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr);
try {
return fs.listStatus(partitionPath, path -> path.getName()
.contains(metaClient.getTableConfig().getROFileFormat().getFileExtension()));
} catch (IOException e) {
throw new HoodieIOException(
"Failed to list data files in partition " + partitionPathStr, e);
}
}
} }

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.table.view;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Realtime Table View which includes both ROStorageformat files and RTStorageFormat files
*/
public class RealtimeTableView extends AbstractTableFileSystemView {
public RealtimeTableView(FileSystem fs, HoodieTableMetaClient metaClient) {
// For realtime table view, visibleActiveCommitTimeline is a merged timeline of all commits and compactions
super(fs, metaClient, metaClient.getActiveTimeline().getTimelineOfActions(
Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.COMPACTION_ACTION)).filterCompletedInstants());
Preconditions.checkArgument(metaClient.getTableType() == HoodieTableType.MERGE_ON_READ,
"Realtime view can only be constructed on Hoodie tables with MERGE_ON_READ storage type");
}
public Map<HoodieDataFile, List<HoodieLogFile>> groupLatestDataFileWithLogFiles(FileSystem fs,
String partitionPath) throws IOException {
// All the files in the partition
FileStatus[] files = fs.listStatus(new Path(metaClient.getBasePath(), partitionPath));
// All the log files filtered from the above list, sorted by version numbers
List<HoodieLogFile> allLogFiles = Arrays.stream(files).filter(s -> s.getPath().getName()
.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()))
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(Collectors.toList(),
l -> l.stream().sorted(HoodieLogFile.getLogVersionComparator())
.collect(Collectors.toList())));
// Filter the delta files by the commit time of the latest base fine and collect as a list
Optional<HoodieInstant> lastTimestamp = metaClient.getActiveTimeline().lastInstant();
if(!lastTimestamp.isPresent()) {
return Maps.newHashMap();
}
return getLatestVersionInPartition(partitionPath, lastTimestamp.get().getTimestamp()).map(
hoodieDataFile -> Pair.of(hoodieDataFile, allLogFiles.stream().filter(
s -> s.getFileId().equals(hoodieDataFile.getFileId()) && s.getBaseCommitTime()
.equals(hoodieDataFile.getCommitTime())).collect(Collectors.toList()))).collect(
Collectors.toMap(
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, HoodieDataFile>) Pair::getKey,
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, List<HoodieLogFile>>) Pair::getRight));
}
}

View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.util;
import com.google.common.collect.Lists;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.exception.HoodieIOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.List;
public class AvroUtils {
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFiles(FileSystem fs,
List<String> deltaFilePaths, Schema expectedSchema) {
List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList();
deltaFilePaths.forEach(s -> {
Path path = new Path(s);
try {
SeekableInput input =
new AvroFSInput(FileContext.getFileContext(fs.getConf()), path);
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
// Set the expected schema to be the current schema to account for schema evolution
reader.setExpected(expectedSchema);
FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
for (GenericRecord deltaRecord : fileReader) {
String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath =
deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath),
new HoodieAvroPayload(deltaRecord)));
}
fileReader.close(); // also closes underlying FsInput
} catch (IOException e) {
throw new HoodieIOException("Could not read avro records from path " + s, e);
}
});
return loadedRecords;
}
}

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.common.util; package com.uber.hoodie.common.util;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile; import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.InvalidHoodiePathException; import com.uber.hoodie.exception.InvalidHoodiePathException;
@@ -45,8 +46,8 @@ import java.util.stream.Stream;
public class FSUtils { public class FSUtils {
private static final Logger LOG = LogManager.getLogger(FSUtils.class); private static final Logger LOG = LogManager.getLogger(FSUtils.class);
// Log files are of this pattern - b5068208-e1a4-11e6-bf01-fe55135034f3.avro.delta.1 // Log files are of this pattern - b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.avro.delta.1
private static final Pattern LOG_FILE_PATTERN = Pattern.compile("(.*)\\.(.*)\\.(.*)\\.([0-9]*)"); private static final Pattern LOG_FILE_PATTERN = Pattern.compile("(.*)_(.*)\\.(.*)\\.(.*)\\.([0-9]*)");
private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10; private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
public static FileSystem getFs() { public static FileSystem getFs() {
@@ -140,7 +141,7 @@ public class FSUtils {
if(!matcher.find()) { if(!matcher.find()) {
throw new InvalidHoodiePathException(logPath, "LogFile"); throw new InvalidHoodiePathException(logPath, "LogFile");
} }
return matcher.group(2) + "." + matcher.group(3); return matcher.group(3) + "." + matcher.group(4);
} }
/** /**
@@ -158,6 +159,21 @@ public class FSUtils {
return matcher.group(1); return matcher.group(1);
} }
/**
* Get the first part of the file name in the log file. That will be the fileId.
* Log file do not have commitTime in the file name.
*
* @param path
* @return
*/
public static String getBaseCommitTimeFromLogPath(Path path) {
Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
if(!matcher.find()) {
throw new InvalidHoodiePathException(path, "LogFile");
}
return matcher.group(2);
}
/** /**
* Get the last part of the file name in the log file and convert to int. * Get the last part of the file name in the log file and convert to int.
* *
@@ -169,11 +185,12 @@ public class FSUtils {
if(!matcher.find()) { if(!matcher.find()) {
throw new InvalidHoodiePathException(logPath, "LogFile"); throw new InvalidHoodiePathException(logPath, "LogFile");
} }
return Integer.parseInt(matcher.group(4)); return Integer.parseInt(matcher.group(5));
} }
public static String makeLogFileName(String fileId, String logFileExtension, int version) { public static String makeLogFileName(String fileId, String logFileExtension,
return String.format("%s%s.%d", fileId, logFileExtension, version); String baseCommitTime, int version) {
return String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version);
} }
/** /**
@@ -198,10 +215,10 @@ public class FSUtils {
* @return * @return
*/ */
public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath, public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension) throws IOException { final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
return Arrays.stream(fs.listStatus(partitionPath, return Arrays.stream(fs.listStatus(partitionPath,
path -> path.getName().startsWith(fileId) && path.getName() path -> path.getName().startsWith(fileId) && path.getName().contains(logFileExtension)))
.contains(logFileExtension))).map(HoodieLogFile::new); .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
} }
/** /**
@@ -215,9 +232,9 @@ public class FSUtils {
* @throws IOException * @throws IOException
*/ */
public static Optional<Integer> getLatestLogVersion(FileSystem fs, Path partitionPath, public static Optional<Integer> getLatestLogVersion(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension) throws IOException { final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<HoodieLogFile> latestLogFile = Optional<HoodieLogFile> latestLogFile =
getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension)); getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime));
if (latestLogFile.isPresent()) { if (latestLogFile.isPresent()) {
return Optional.of(latestLogFile.get().getLogVersion()); return Optional.of(latestLogFile.get().getLogVersion());
} }
@@ -225,9 +242,9 @@ public class FSUtils {
} }
public static int getCurrentLogVersion(FileSystem fs, Path partitionPath, public static int getCurrentLogVersion(FileSystem fs, Path partitionPath,
final String fileId, final String logFileExtension) throws IOException { final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<Integer> currentVersion = Optional<Integer> currentVersion =
getLatestLogVersion(fs, partitionPath, fileId, logFileExtension); getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
// handle potential overflow // handle potential overflow
return (currentVersion.isPresent()) ? currentVersion.get() : 1; return (currentVersion.isPresent()) ? currentVersion.get() : 1;
} }
@@ -242,9 +259,9 @@ public class FSUtils {
* @throws IOException * @throws IOException
*/ */
public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId, public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId,
final String logFileExtension) throws IOException { final String logFileExtension, final String baseCommitTime) throws IOException {
Optional<Integer> currentVersion = Optional<Integer> currentVersion =
getLatestLogVersion(fs, partitionPath, fileId, logFileExtension); getLatestLogVersion(fs, partitionPath, fileId, logFileExtension, baseCommitTime);
// handle potential overflow // handle potential overflow
return (currentVersion.isPresent()) ? currentVersion.get() + 1 : 1; return (currentVersion.isPresent()) ? currentVersion.get() + 1 : 1;
} }
@@ -287,4 +304,5 @@ public class FSUtils {
return recovered; return recovered;
} }
} }

View File

@@ -18,6 +18,7 @@ package com.uber.hoodie.common.util;
import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.exception.SchemaCompatabilityException;
import org.apache.avro.Schema; import org.apache.avro.Schema;
import org.apache.avro.generic.*; import org.apache.avro.generic.*;
import org.apache.avro.io.BinaryEncoder; import org.apache.avro.io.BinaryEncoder;
@@ -124,14 +125,13 @@ public class HoodieAvroUtils {
/** /**
* Given a avro record with a given schema, rewrites it into the new schema * Given a avro record with a given schema, rewrites it into the new schema
*/ */
public static GenericRecord rewriteRecord(GenericRecord record, Schema newSchema) public static GenericRecord rewriteRecord(GenericRecord record, Schema newSchema) {
throws Exception {
GenericRecord newRecord = new GenericData.Record(newSchema); GenericRecord newRecord = new GenericData.Record(newSchema);
for (Schema.Field f : record.getSchema().getFields()) { for (Schema.Field f : record.getSchema().getFields()) {
newRecord.put(f.name(), record.get(f.name())); newRecord.put(f.name(), record.get(f.name()));
} }
if (!new GenericData().validate(newSchema, newRecord)) { if (!new GenericData().validate(newSchema, newRecord)) {
throw new Exception( throw new SchemaCompatabilityException(
"Unable to validate the rewritten record " + record + " against schema " "Unable to validate the rewritten record " + record + " against schema "
+ newSchema); + newSchema);
} }

View File

@@ -16,6 +16,8 @@
package com.uber.hoodie.exception; package com.uber.hoodie.exception;
import java.io.Serializable;
/** /**
* <p> * <p>
* Exception thrown for Hoodie failures. The root of * Exception thrown for Hoodie failures. The root of
@@ -27,7 +29,7 @@ package com.uber.hoodie.exception;
* </p> * </p>
* *
*/ */
public class HoodieException extends RuntimeException { public class HoodieException extends RuntimeException implements Serializable {
public HoodieException() { public HoodieException() {
super(); super();
} }

View File

@@ -0,0 +1,31 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.exception;
/**
 * Exception thrown when an Avro record cannot be rewritten into / validated against
 * a target schema (see HoodieAvroUtils.rewriteRecord), i.e. the record and the new
 * schema are not compatible.
 *
 * NOTE(review): "Compatability" is a misspelling of "Compatibility"; the name is
 * kept as-is because renaming the class would break existing catch sites.
 */
public class SchemaCompatabilityException extends HoodieException {
// Standard exception constructor trio: message-only, message + cause, cause-only.
public SchemaCompatabilityException(String message) {
super(message);
}
public SchemaCompatabilityException(String message, Throwable t) {
super(message, t);
}
public SchemaCompatabilityException(Throwable t) {
super(t);
}
}

View File

@@ -20,12 +20,25 @@ import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output; import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.serializers.JavaSerializer; import com.esotericsoftware.kryo.serializers.JavaSerializer;
import com.google.common.collect.Maps;
import com.uber.hoodie.common.table.HoodieTableConfig; import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogAppendConfig;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.log.avro.AvroLogAppender;
import com.uber.hoodie.common.table.log.avro.RollingAvroLogAppender;
import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.jute.Index;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
@@ -38,11 +51,16 @@ import java.io.Serializable;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.UUID; import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
public class HoodieTestUtils { public class HoodieTestUtils {
public static FileSystem fs = FSUtils.getFs(); public static FileSystem fs = FSUtils.getFs();
@@ -51,8 +69,13 @@ public class HoodieTestUtils {
public static final int DEFAULT_TASK_PARTITIONID = 1; public static final int DEFAULT_TASK_PARTITIONID = 1;
public static HoodieTableMetaClient init(String basePath) throws IOException { public static HoodieTableMetaClient init(String basePath) throws IOException {
return initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
}
public static HoodieTableMetaClient initTableType(String basePath, HoodieTableType tableType) throws IOException {
Properties properties = new Properties(); Properties properties = new Properties();
properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME); properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME);
properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties); return HoodieTableMetaClient.initializePathAsHoodieDataset(fs, basePath, properties);
} }
@@ -143,4 +166,41 @@ public class HoodieTestUtils {
input.close(); input.close();
return deseralizedObject; return deseralizedObject;
} }
/**
 * Test helper: writes the given updated records into Hoodie delta log files,
 * grouping them by their current record location (fileId + base commit time) and
 * appending one batch per location via AvroLogAppender.
 *
 * Assumes every record already has a current location set (getCurrentLocation is
 * used as the grouping key) and that all records sharing a location live in the
 * same partition — the partition path is taken from the first record of each group.
 *
 * @param basePath       base path of the Hoodie dataset
 * @param schema         Avro schema used both to materialize and to append records
 * @param updatedRecords records to append to the log files
 */
public static void writeRecordsToLogFiles(String basePath, Schema schema, List<HoodieRecord> updatedRecords) {
// Group records by (fileId, baseCommitTime) so each group maps to one log file.
Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream()
.collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation));
groupedUpdated.entrySet().forEach(s -> {
HoodieRecordLocation location = s.getKey();
// All records in a group are assumed to share this partition path.
String partitionPath = s.getValue().get(0).getPartitionPath();
HoodieLogAppendConfig logConfig = null;
try {
logConfig = HoodieLogAppendConfig.newBuilder()
.onPartitionPath(new Path(basePath, partitionPath))
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION)
.withFileId(location.getFileId())
.withBaseCommitTime(location.getCommitTime())
.withSchema(schema).withFs(fs).build();
AvroLogAppender log = new AvroLogAppender(logConfig);
log.append(s.getValue().stream().map(r -> {
try {
// Materialize the payload and stamp the Hoodie key fields (commit time left blank).
GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema);
HoodieAvroUtils.addHoodieKeyToRecord(val,
r.getRecordKey(),
r.getPartitionPath(),
"");
return val;
} catch (IOException e) {
// NOTE(review): silently maps a failed record to null, which is then appended;
// confirm AvroLogAppender tolerates nulls, or consider failing the test here.
return null;
}
}).collect(Collectors.toList()));
log.close();
} catch (Exception e) {
// Test utility: surface any setup/append failure as a JUnit assertion failure.
fail(e.toString());
}
});
}
} }

View File

@@ -82,6 +82,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
logAppender.append(SchemaTestUtil.generateTestRecords(0, 100)); logAppender.append(SchemaTestUtil.generateTestRecords(0, 100));
@@ -119,6 +120,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
logAppender.append(SchemaTestUtil.generateTestRecords(0, 100)); logAppender.append(SchemaTestUtil.generateTestRecords(0, 100));
@@ -139,6 +141,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
logAppender.append(SchemaTestUtil.generateTestRecords(0, 100)); logAppender.append(SchemaTestUtil.generateTestRecords(0, 100));
@@ -166,6 +169,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
long size1 = logAppender.getCurrentSize(); long size1 = logAppender.getCurrentSize();
@@ -188,6 +192,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
logAppender.append(SchemaTestUtil.generateTestRecords(0, 100)); logAppender.append(SchemaTestUtil.generateTestRecords(0, 100));
@@ -233,6 +238,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig); RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
long size1 = logAppender.getCurrentSize(); long size1 = logAppender.getCurrentSize();
@@ -272,6 +278,7 @@ public class AvroLogAppenderTest {
HoodieLogAppendConfig logConfig = HoodieLogAppendConfig logConfig =
HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs) .withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs)
.build(); .build();
@@ -284,6 +291,7 @@ public class AvroLogAppenderTest {
// Need to rebuild config to set the latest version as path // Need to rebuild config to set the latest version as path
logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs).build();
logAppender = new RollingAvroLogAppender(logConfig); logAppender = new RollingAvroLogAppender(logConfig);
long size2 = logAppender.getCurrentSize(); long size2 = logAppender.getCurrentSize();
@@ -293,18 +301,21 @@ public class AvroLogAppenderTest {
logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath) logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
.withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
.withBaseCommitTime("100")
.withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs).build(); .withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500).withFs(fs).build();
List<HoodieLogFile> allLogFiles = FSUtils List<HoodieLogFile> allLogFiles = FSUtils
.getAllLogFiles(fs, partitionPath, logConfig.getLogFile().getFileId(), .getAllLogFiles(fs, partitionPath, logConfig.getLogFile().getFileId(),
HoodieLogFile.DELTA_EXTENSION).collect(Collectors.toList()); HoodieLogFile.DELTA_EXTENSION, logConfig.getLogFile().getBaseCommitTime())
.collect(Collectors.toList());
assertEquals("", 2, allLogFiles.size()); assertEquals("", 2, allLogFiles.size());
SortedMap<Integer, List<Long>> offsets = Maps.newTreeMap(); SortedMap<Integer, List<Long>> offsets = Maps.newTreeMap();
offsets.put(1, Lists.newArrayList(size1)); offsets.put(1, Lists.newArrayList(size1));
offsets.put(2, Lists.newArrayList(size2)); offsets.put(2, Lists.newArrayList(size2));
CompositeAvroLogReader reader = CompositeAvroLogReader reader =
new CompositeAvroLogReader(partitionPath, logConfig.getLogFile().getFileId(), fs, new CompositeAvroLogReader(partitionPath, logConfig.getLogFile().getFileId(),
logConfig.getSchema(), HoodieLogFile.DELTA_EXTENSION); logConfig.getLogFile().getBaseCommitTime(), fs, logConfig.getSchema(),
HoodieLogFile.DELTA_EXTENSION);
Iterator<GenericRecord> results = reader.readBlocks(offsets); Iterator<GenericRecord> results = reader.readBlocks(offsets);
List<GenericRecord> totalBatch = IteratorUtils.toList(results); List<GenericRecord> totalBatch = IteratorUtils.toList(results);
assertEquals("Stream collect should return all 200 records", 200, totalBatch.size()); assertEquals("Stream collect should return all 200 records", 200, totalBatch.size());