1
0

CodeStyle formatting to conform to basic Checkstyle rules.

The code-style rules follow google style with some changes:

1. Increase line length from 100 to 120
2. Disable JavaDoc-related Checkstyle rules, as these need more manual work.

Both source and test code are checked for code-style
This commit is contained in:
Balaji Varadarajan
2018-03-20 16:29:20 -07:00
committed by vinoth chandar
parent 987f5d6b96
commit 788e4f2d2e
200 changed files with 6209 additions and 5975 deletions

View File

@@ -75,23 +75,425 @@ import scala.Tuple2;
/**
* Implementation of a very heavily read-optimized Hoodie Table where
*
* <p>
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing
* file, to expand it
*
* <p>
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
*/
public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieTable<T> {
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
public HoodieCopyOnWriteTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
super(config, metaClient);
}
private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class);
/**
 * Builds a Spark function that, for one RDD partition of (partitionPath, deletePath) tuples,
 * deletes each path from the filesystem and folds the outcomes into one
 * {@link PartitionCleanStat} per partition path.
 */
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String,
    PartitionCleanStat> deleteFilesFunc(
    HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
      iter -> {
        Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
        FileSystem fs = table.getMetaClient().getFs();
        while (iter.hasNext()) {
          Tuple2<String, String> partitionDelFileTuple = iter.next();
          String partitionPath = partitionDelFileTuple._1();
          String deletePathStr = partitionDelFileTuple._2();
          Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
          // Lazily create one stat accumulator per distinct partition path seen.
          if (!partitionCleanStatMap.containsKey(partitionPath)) {
            partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
          }
          PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
          partitionCleanStat.addDeleteFilePatterns(deletePathStr);
          partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
        }
        // Emit (partitionPath, stat) pairs for the subsequent reduceByKey merge.
        return partitionCleanStatMap.entrySet().stream()
            .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
            .collect(Collectors.toList()).iterator();
      };
}
/**
 * Builds a Spark function that maps a partition path to the (partitionPath, deletePath) pairs
 * the cleaner policy selects for deletion in that partition.
 */
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(HoodieTable table,
    HoodieWriteConfig config) {
  return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
    HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
    return cleaner.getDeletePaths(partitionPathToClean).stream()
        .map(deleteFile -> new Tuple2<>(partitionPathToClean, deleteFile.toString())).iterator();
  };
}
/**
 * Deletes a single (non-recursive) path on the given filesystem.
 *
 * @return true when the filesystem reports the delete succeeded
 * @throws IOException if the filesystem delete call fails
 */
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
    throws IOException {
  final Path pathToDelete = new Path(deletePathStr);
  logger.debug("Working on delete path :" + pathToDelete);
  final boolean wasDeleted = fs.delete(pathToDelete, false);
  if (wasDeleted) {
    logger.debug("Cleaned file at path :" + pathToDelete);
  }
  return wasDeleted;
}
/**
 * Returns the partitioner used to bucket upserts; requires a workload profile.
 *
 * @throws HoodieUpsertException if no profile is supplied
 */
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile);
}
/** Inserts reuse the upsert partitioner for this table type. */
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
/** Copy-on-write always needs a workload profile (see {@link #getUpsertPartitioner}). */
@Override
public boolean isWorkloadProfileNeeded() {
  return true;
}
/** Compaction does not apply to copy-on-write tables; always throws. */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
  throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
/**
 * Handles updates arriving as a record iterator: merges them into a new version of the file
 * identified by {@code fileLoc}.
 */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Handles updates arriving as a key-to-record map: merges them into a new version of the file
 * identified by {@code fileLoc}.
 */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Streams every record of the old parquet file through the merge handle, producing a new file
 * version that replaces updated records with their new values.
 *
 * <p>Fix: the original closed the reader and the merge handle in one finally block, so an
 * exception from {@code reader.close()} would leak the merge handle. The reader is now managed
 * by try-with-resources and the handle is closed in its own finally.
 *
 * @throws HoodieUpsertException if the old file path is missing or the old file cannot be read
 */
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle,
    String commitTime, String fileLoc) throws IOException {
  if (upsertHandle.getOldFilePath() == null) {
    throw new HoodieUpsertException(
        "Error in finding the old file path at commit " + commitTime + " at fileLoc: " + fileLoc);
  } else {
    AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
    try (ParquetReader<IndexedRecord> reader =
        AvroParquetReader.builder(upsertHandle.getOldFilePath())
            .withConf(getHadoopConf()).build()) {
      IndexedRecord record;
      while ((record = reader.read()) != null) {
        // Two types of writes here (new record, and old record).
        // We have already caught exceptions while writing new records.
        // But for old records, we should fail if any exception happens.
        upsertHandle.write((GenericRecord) record);
      }
    } catch (IOException e) {
      throw new HoodieUpsertException(
          "Failed to read record from " + upsertHandle.getOldFilePath() + " with new Schema "
              + upsertHandle.getSchema(), e);
    } finally {
      // Always release the handle, even if closing the reader failed.
      upsertHandle.close();
    }
  }
  //TODO(vc): This needs to be revisited
  if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
    logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
        + upsertHandle.getWriteStatus());
  }
  return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
      .iterator();
}
/** Factory for the merge handle driving an iterator-based update; overridable by subclasses. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) {
  return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
/** Factory for the merge handle driving a map-based update; overridable by subclasses. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) {
  return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
/** Handles pure inserts by lazily writing the records into new files as they are consumed. */
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
/**
 * Dispatches one upsert bucket to the insert or update path based on its bucket type.
 * Any failure is logged and rethrown wrapped in a {@link HoodieUpsertException}.
 */
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo bucketInfo = upsertPartitioner.getBucketInfo(partition);
  BucketType bucketType = bucketInfo.bucketType;
  try {
    switch (bucketType) {
      case INSERT:
        return handleInsert(commitTime, recordItr);
      case UPDATE:
        return handleUpdate(commitTime, bucketInfo.fileLoc, recordItr);
      default:
        throw new HoodieUpsertException(
            "Unknown bucketType " + bucketType + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + bucketType + " for partition :" + partition;
    logger.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
/** Insert partitions are processed through the same code path as upsert partitions. */
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
 * Performs cleaning of partition paths according to cleaning policy and returns the number of
 * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
 * task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    // Candidate partitions come from listing the table's base path.
    List<String> partitionsToClean = FSUtils
        .getAllPartitionPaths(fs, getMetaClient().getBasePath(),
            config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
        .getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean here mom. It is already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits.
 *
 * <p>Deletes every parquet file in the partition whose encoded commit time is in
 * {@code commits}, and reports per-file success.
 *
 * @return map from each targeted file to whether its delete succeeded
 * @throws IOException if listing or deleting fails at the filesystem level
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  logger.info("Cleaning path " + partitionPath);
  FileSystem fs = getMetaClient().getFs();
  FileStatus[] toBeDeleted = fs
      .listStatus(new Path(config.getBasePath(), partitionPath), path -> {
        // Only parquet data files whose commit time matches one of the rolled-back commits.
        if (!path.toString().contains(".parquet")) {
          return false;
        }
        String fileCommitTime = FSUtils.getCommitTime(path.getName());
        return commits.contains(fileCommitTime);
      });
  // Plain JDK HashMap instead of Guava's Maps.newHashMap() — no behavior change.
  Map<FileStatus, Boolean> results = new HashMap<>();
  for (FileStatus file : toBeDeleted) {
    boolean success = fs.delete(file.getPath(), false);
    results.put(file, success);
    logger.info("Delete file " + file.getPath() + "\t" + success);
  }
  return results;
}
/**
 * Rolls back the given commits: unpublishes them on the timeline, deletes the parquet files they
 * produced (in parallel, one task per partition), cleans temporary data files, and finally
 * removes the inflight markers.
 *
 * @return per-partition rollback statistics
 */
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
    throws IOException {
  String actionType = this.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
  // Atomically unpublish all the commits
  commits.stream().filter(s -> !inflights.contains(s))
      .map(s -> new HoodieInstant(false, actionType, s))
      .forEach(activeTimeline::revertToInflight);
  logger.info("Unpublished " + commits);
  // delete all the data files for all these commits
  logger.info("Clean out all parquet files generated for commits: " + commits);
  List<HoodieRollbackStat> stats = jsc.parallelize(FSUtils
      .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        // Scan all partitions files with this commit time
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  // Remove the rolled back inflight commits
  commits.stream().map(s -> new HoodieInstant(true, actionType, s))
      .forEach(activeTimeline::deleteInflight);
  logger.info("Deleted inflight commits " + commits);
  return stats;
}
/**
 * Finalize the written data files.
 *
 * <p>When the temp-folder mode is enabled, each data file was written under a temporary path and
 * is renamed here (in parallel) to its final location; temporary files are then cleaned up.
 *
 * <p>Fix: the rename failure path now chains the underlying {@link IOException} as the cause of
 * the thrown {@link HoodieIOException} instead of dropping it.
 *
 * @param writeStatuses List of WriteStatus
 * @return number of files finalized, or empty when temp-folder mode is disabled
 */
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return Optional.empty();
  }
  // This is to rename each data file from temporary path to its final location
  List<Tuple2<String, Boolean>> results = jsc
      .parallelize(writeStatuses, config.getFinalizeWriteParallelism()).map(writeStatus -> {
        Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>)
            writeStatus;
        HoodieWriteStat writeStat = writeStatTuple2._2();
        final FileSystem fs = getMetaClient().getFs();
        final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
        if (writeStat.getTempPath() != null) {
          final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
          boolean success;
          try {
            logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
            success = fs.rename(tempPath, finalPath);
          } catch (IOException e) {
            // Keep the root cause attached for diagnosis.
            throw new HoodieIOException(
                "Failed to rename file: " + tempPath + " to " + finalPath, e);
          }
          if (!success) {
            throw new HoodieIOException(
                "Failed to rename file: " + tempPath + " to " + finalPath);
          }
        }
        return new Tuple2<>(writeStat.getPath(), true);
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  return Optional.of(results.size());
}
/**
 * Clean temporary data files that are produced from previous failed commit or retried spark
 * stages.
 *
 * <p>No-op when temp-folder mode is disabled or the folder does not exist. Deletion runs in
 * parallel; any per-file failure aborts with a {@link HoodieIOException}.
 *
 * <p>Fix: the outer IOException is now chained as the cause of the rethrown
 * {@link HoodieIOException} instead of being dropped.
 */
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return;
  }
  final FileSystem fs = getMetaClient().getFs();
  final Path temporaryFolder = new Path(config.getBasePath(),
      HoodieTableMetaClient.TEMPFOLDER_NAME);
  try {
    if (!fs.exists(temporaryFolder)) {
      logger.info("Temporary folder does not exist: " + temporaryFolder);
      return;
    }
    List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
    List<Tuple2<String, Boolean>> results = jsc
        .parallelize(fileStatusesList, config.getFinalizeWriteParallelism()).map(fileStatus -> {
          // Re-acquire the FileSystem on the executor side.
          FileSystem fs1 = getMetaClient().getFs();
          boolean success = fs1.delete(fileStatus.getPath(), false);
          logger
              .info("Deleting file in temporary folder" + fileStatus.getPath() + "\t" + success);
          return new Tuple2<>(fileStatus.getPath().toString(), success);
        }).collect();
    for (Tuple2<String, Boolean> result : results) {
      if (!result._2()) {
        logger.info("Failed to delete file: " + result._1());
        throw new HoodieIOException("Failed to delete file in temporary folder: " + result._1());
      }
    }
  } catch (IOException e) {
    // Keep the root cause attached for diagnosis.
    throw new HoodieIOException(
        "Failed to clean data files in temporary folder: " + temporaryFolder, e);
  }
}
/**
 * Runs the cleaner over the given partitions: expands each partition into its files to delete,
 * repartitions to even out skew, deletes in parallel, merges per-partition stats, and returns one
 * {@link HoodieCleanStat} per input partition (empty stat if nothing was cleaned there).
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
          // merge partition level clean stats below
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
              .merge(e2)).collect();
  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
      .collect(Collectors.toMap(e -> e._1(), e -> e._2()));
  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
  // Return PartitionCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
            .get(partitionPath) : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
  }).collect(Collectors.toList());
}
/**
 * Type of a write bucket: UPDATE rewrites an existing file, INSERT creates a new one.
 * (The diff rendering duplicated the constant list; restored to a valid two-constant enum.)
 */
enum BucketType {
  UPDATE, INSERT
}
/**
 * Serializable per-partition accumulator of cleaner results: the delete patterns attempted and
 * which deletes succeeded or failed. Instances for the same partition can be merged.
 */
private static class PartitionCleanStat implements Serializable {

  private final String partitionPath;
  private final List<String> deletePathPatterns = new ArrayList<>();
  private final List<String> successDeleteFiles = new ArrayList<>();
  private final List<String> failedDeleteFiles = new ArrayList<>();

  private PartitionCleanStat(String partitionPath) {
    this.partitionPath = partitionPath;
  }

  private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
    // Route the path into the success or failure bucket.
    (deletedFileResult ? successDeleteFiles : failedDeleteFiles).add(deletePathStr);
  }

  private void addDeleteFilePatterns(String deletePathStr) {
    deletePathPatterns.add(deletePathStr);
  }

  /** Folds another stat for the same partition into this one; throws on a partition mismatch. */
  private PartitionCleanStat merge(PartitionCleanStat other) {
    if (!partitionPath.equals(other.partitionPath)) {
      throw new RuntimeException(String
          .format("partitionPath is not a match: (%s, %s)", partitionPath, other.partitionPath));
    }
    deletePathPatterns.addAll(other.deletePathPatterns);
    successDeleteFiles.addAll(other.successDeleteFiles);
    failedDeleteFiles.addAll(other.failedDeleteFiles);
    return this;
  }
}
/**
@@ -150,45 +552,37 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
/**
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition)
*/
class UpsertPartitioner extends Partitioner {
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
/**
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming
* workload into
*/
private int totalBuckets = 0;
/**
* Stat for the current workload. Helps in determining total inserts, upserts etc.
*/
private WorkloadStat globalStat;
/**
* Helps decide which bucket an incoming update should go to.
*/
private HashMap<String, Integer> updateLocationToBucket;
/**
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
*/
private HashMap<String, List<InsertBucket>> partitionPathToInsertBuckets;
/**
* Remembers what type each bucket is for later.
*/
private HashMap<Integer, BucketInfo> bucketInfoMap;
/**
* List of all small files to be corrected
*/
List<SmallFile> smallFiles = new ArrayList<SmallFile>();
UpsertPartitioner(WorkloadProfile profile) {
updateLocationToBucket = new HashMap<>();
partitionPathToInsertBuckets = new HashMap<>();
@@ -198,16 +592,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
assignUpdates(profile);
assignInserts(profile);
logger.info("Total Buckets :" + totalBuckets + ", " +
"buckets info => " + bucketInfoMap + ", \n" +
"Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" +
"UpdateLocations mapped to buckets =>" + updateLocationToBucket);
logger.info(
"Total Buckets :" + totalBuckets + ", " + "buckets info => " + bucketInfoMap + ", \n"
+ "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n"
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
}
private void assignUpdates(WorkloadProfile profile) {
// each update location gets a partition
WorkloadStat gStat = profile.getGlobalStat();
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) {
for (Map.Entry<String, Pair<String, Long>> updateLocEntry : gStat.getUpdateLocationToCount()
.entrySet()) {
addUpdateBucket(updateLocEntry.getKey());
}
}
@@ -270,10 +665,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L);
logger
.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets
+ ", recordsPerBucket => " + insertRecordsPerBucket);
logger.info(
"After small file assignment: unassignedInserts => " + totalUnassignedInserts
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => "
+ insertRecordsPerBucket);
for (int b = 0; b < insertBuckets; b++) {
bucketNumbers.add(totalBuckets);
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
@@ -339,8 +734,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
*/
private long averageBytesPerRecord() {
long avgSize = 0L;
HoodieTimeline commitTimeline =
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants();
try {
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
@@ -372,7 +767,8 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public int getPartition(Object key) {
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey,
Option<HoodieRecordLocation>>) key;
if (keyLocation._2().isDefined()) {
HoodieRecordLocation location = keyLocation._2().get();
return updateLocationToBucket.get(location.getFileId());
@@ -396,420 +792,4 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
}
}
}
/** Pre-refactor copy: returns the upsert partitioner; requires a non-null workload profile. */
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
  if (profile == null) {
    throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
  }
  return new UpsertPartitioner(profile);
}
/** Inserts reuse the upsert partitioner. */
@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
/** A workload profile is always required for this table type. */
@Override
public boolean isWorkloadProfileNeeded() {
  return true;
}
/** Compaction is not applicable to copy-on-write tables; always throws. */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
  throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
/** Merges an iterator of updated records into a new version of file {@code fileLoc}. */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr)
    throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, recordItr);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/** Merges a key-to-record map of updates into a new version of file {@code fileLoc}. */
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords)
    throws IOException {
  // these are updates
  HoodieMergeHandle upsertHandle = getUpdateHandle(commitTime, fileLoc, keyToNewRecords);
  return handleUpdateInternal(upsertHandle, commitTime, fileLoc);
}
/**
 * Streams every record of the old parquet file through the merge handle to build the new file
 * version, then returns the handle's write status.
 *
 * @throws HoodieUpsertException if the old file path is missing or the old file cannot be read
 */
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String commitTime, String fileLoc)
    throws IOException {
  if (upsertHandle.getOldFilePath() == null) {
    throw new HoodieUpsertException("Error in finding the old file path at commit " +
        commitTime + " at fileLoc: " + fileLoc);
  } else {
    AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
    ParquetReader<IndexedRecord> reader =
        AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf())
            .build();
    try {
      IndexedRecord record;
      while ((record = reader.read()) != null) {
        // Two types of writes here (new record, and old record).
        // We have already catch the exception during writing new records.
        // But for old records, we should fail if any exception happens.
        upsertHandle.write((GenericRecord) record);
      }
    } catch (IOException e) {
      throw new HoodieUpsertException(
          "Failed to read record from " + upsertHandle.getOldFilePath()
              + " with new Schema " + upsertHandle.getSchema(), e);
    } finally {
      // NOTE(review): if reader.close() throws, upsertHandle.close() is skipped — potential leak.
      reader.close();
      upsertHandle.close();
    }
  }
  //TODO(vc): This needs to be revisited
  if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
    logger.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath()
        + ", " + upsertHandle.getWriteStatus());
  }
  return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus()))
      .iterator();
}
/** Factory for the merge handle backing an iterator-based update. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Iterator<HoodieRecord<T>> recordItr) {
  return new HoodieMergeHandle<>(config, commitTime, this, recordItr, fileLoc);
}
/** Factory for the merge handle backing a map-based update. */
protected HoodieMergeHandle getUpdateHandle(String commitTime, String fileLoc,
    Map<String, HoodieRecord<T>> keyToNewRecords) {
  return new HoodieMergeHandle<>(config, commitTime, this, keyToNewRecords, fileLoc);
}
/** Lazily writes inserted records into new files as the iterator is consumed. */
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  return new LazyInsertIterable<>(recordItr, config, commitTime, this);
}
/**
 * Dispatches one upsert bucket to the insert or update path based on its bucket type; failures
 * are logged and rethrown wrapped in a HoodieUpsertException.
 */
@SuppressWarnings("unchecked")
@Override
public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition,
    Iterator recordItr, Partitioner partitioner) {
  UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
  BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
  BucketType btype = binfo.bucketType;
  try {
    if (btype.equals(BucketType.INSERT)) {
      return handleInsert(commitTime, recordItr);
    } else if (btype.equals(BucketType.UPDATE)) {
      return handleUpdate(commitTime, binfo.fileLoc, recordItr);
    } else {
      throw new HoodieUpsertException(
          "Unknown bucketType " + btype + " for partition :" + partition);
    }
  } catch (Throwable t) {
    String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
    logger.error(msg, t);
    throw new HoodieUpsertException(msg, t);
  }
}
/** Insert partitions go through the same code path as upsert partitions. */
@Override
public Iterator<List<WriteStatus>> handleInsertPartition(String commitTime, Integer partition,
    Iterator recordItr,
    Partitioner partitioner) {
  return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
}
/**
 * Performs cleaning of partition paths according to cleaning policy and returns the number of
 * files cleaned. Handles skews in partitions to clean by making files to clean as the unit of
 * task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    // Candidate partitions come from listing the table's base path.
    List<String> partitionsToClean =
        FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
            config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
        .getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean here mom. It is already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits.
 *
 * @return map from each targeted file to whether its delete succeeded
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  logger.info("Cleaning path " + partitionPath);
  FileSystem fs = getMetaClient().getFs();
  FileStatus[] toBeDeleted =
      fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
        // Only parquet files whose commit time matches a rolled-back commit.
        if (!path.toString().contains(".parquet")) {
          return false;
        }
        String fileCommitTime = FSUtils.getCommitTime(path.getName());
        return commits.contains(fileCommitTime);
      });
  Map<FileStatus, Boolean> results = Maps.newHashMap();
  for (FileStatus file : toBeDeleted) {
    boolean success = fs.delete(file.getPath(), false);
    results.put(file, success);
    logger.info("Delete file " + file.getPath() + "\t" + success);
  }
  return results;
}
/**
 * Rolls back the given commits: unpublishes them on the timeline, deletes their parquet files in
 * parallel (one task per partition), cleans temporary data files, and removes inflight markers.
 */
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
    throws IOException {
  String actionType = this.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp)
      .collect(Collectors.toList());
  // Atomically unpublish all the commits
  commits.stream().filter(s -> !inflights.contains(s))
      .map(s -> new HoodieInstant(false, actionType, s))
      .forEach(activeTimeline::revertToInflight);
  logger.info("Unpublished " + commits);
  // delete all the data files for all these commits
  logger.info("Clean out all parquet files generated for commits: " + commits);
  List<HoodieRollbackStat> stats = jsc.parallelize(
      FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        // Scan all partitions files with this commit time
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  // Remove the rolled back inflight commits
  commits.stream().map(s -> new HoodieInstant(true, actionType, s))
      .forEach(activeTimeline::deleteInflight);
  logger.info("Deleted inflight commits " + commits);
  return stats;
}
/**
 * Finalize the written data files
 *
 * <p>When the temp-folder mode is enabled, renames each data file from its temporary path to its
 * final location (in parallel), then cleans up the temporary folder.
 *
 * @param writeStatuses List of WriteStatus
 * @return number of files finalized
 */
@Override
@SuppressWarnings("unchecked")
public Optional<Integer> finalizeWrite(JavaSparkContext jsc, List writeStatuses) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return Optional.empty();
  }
  // This is to rename each data file from temporary path to its final location
  List<Tuple2<String, Boolean>> results = jsc.parallelize(writeStatuses, config.getFinalizeWriteParallelism())
      .map(writeStatus -> {
        Tuple2<String, HoodieWriteStat> writeStatTuple2 = (Tuple2<String, HoodieWriteStat>) writeStatus;
        HoodieWriteStat writeStat = writeStatTuple2._2();
        final FileSystem fs = getMetaClient().getFs();
        final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());
        if (writeStat.getTempPath() != null) {
          final Path tempPath = new Path(config.getBasePath(), writeStat.getTempPath());
          boolean success;
          try {
            logger.info("Renaming temporary file: " + tempPath + " to " + finalPath);
            success = fs.rename(tempPath, finalPath);
          } catch (IOException e) {
            // NOTE(review): cause `e` is dropped here — consider chaining it.
            throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
          }
          if (!success) {
            throw new HoodieIOException("Failed to rename file: " + tempPath + " to " + finalPath);
          }
        }
        return new Tuple2<>(writeStat.getPath(), true);
      }).collect();
  // clean temporary data files
  cleanTemporaryDataFiles(jsc);
  return Optional.of(results.size());
}
/**
 * Clean temporary data files that are produced from previous failed commit or retried spark
 * stages.
 *
 * <p>No-op when temp-folder mode is disabled or the folder does not exist; any per-file delete
 * failure aborts with a HoodieIOException.
 */
private void cleanTemporaryDataFiles(JavaSparkContext jsc) {
  if (!config.shouldUseTempFolderForCopyOnWrite()) {
    return;
  }
  final FileSystem fs = getMetaClient().getFs();
  final Path temporaryFolder = new Path(config.getBasePath(),
      HoodieTableMetaClient.TEMPFOLDER_NAME);
  try {
    if (!fs.exists(temporaryFolder)) {
      logger.info("Temporary folder does not exist: " + temporaryFolder);
      return;
    }
    List<FileStatus> fileStatusesList = Arrays.asList(fs.listStatus(temporaryFolder));
    List<Tuple2<String, Boolean>> results = jsc
        .parallelize(fileStatusesList, config.getFinalizeWriteParallelism())
        .map(fileStatus -> {
          // Re-acquire the FileSystem on the executor side.
          FileSystem fs1 = getMetaClient().getFs();
          boolean success = fs1.delete(fileStatus.getPath(), false);
          logger.info("Deleting file in temporary folder" + fileStatus.getPath() + "\t"
              + success);
          return new Tuple2<>(fileStatus.getPath().toString(), success);
        }).collect();
    for (Tuple2<String, Boolean> result : results) {
      if (!result._2()) {
        logger.info("Failed to delete file: " + result._1());
        throw new HoodieIOException(
            "Failed to delete file in temporary folder: " + result._1());
      }
    }
  } catch (IOException e) {
    // NOTE(review): cause `e` is dropped here — consider chaining it.
    throw new HoodieIOException(
        "Failed to clean data files in temporary folder: " + temporaryFolder);
  }
}
/**
 * Serializable per-partition accumulator of cleaner results: delete patterns attempted and which
 * deletes succeeded or failed. Instances for the same partition can be merged.
 */
private static class PartitionCleanStat implements Serializable {

  private final String partitionPath;
  private final List<String> deletePathPatterns = new ArrayList<>();
  private final List<String> successDeleteFiles = new ArrayList<>();
  private final List<String> failedDeleteFiles = new ArrayList<>();

  private PartitionCleanStat(String partitionPath) {
    this.partitionPath = partitionPath;
  }

  private void addDeletedFileResult(String deletePathStr, Boolean deletedFileResult) {
    if (deletedFileResult) {
      successDeleteFiles.add(deletePathStr);
    } else {
      failedDeleteFiles.add(deletePathStr);
    }
  }

  private void addDeleteFilePatterns(String deletePathStr) {
    deletePathPatterns.add(deletePathStr);
  }

  /** Folds another stat for the same partition into this one; throws on a partition mismatch. */
  private PartitionCleanStat merge(PartitionCleanStat other) {
    if (!this.partitionPath.equals(other.partitionPath)) {
      throw new RuntimeException(String.format(
          "partitionPath is not a match: (%s, %s)",
          partitionPath, other.partitionPath));
    }
    successDeleteFiles.addAll(other.successDeleteFiles);
    deletePathPatterns.addAll(other.deletePathPatterns);
    failedDeleteFiles.addAll(other.failedDeleteFiles);
    return this;
  }
}
/**
 * Runs the cleaner over the given partitions: expands partitions into files to delete,
 * repartitions to even out skew, deletes in parallel, merges per-partition stats, and returns
 * one HoodieCleanStat per input partition (an empty stat when nothing was cleaned there).
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this))
      .reduceByKey(
          // merge partition level clean stats below
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
              .merge(e2))
      .collect();
  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats
      .stream().collect(Collectors.toMap(e -> e._1(), e -> e._2()));
  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
  // Return PartitionCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath)) ?
            partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder()
        .withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles)
        .build();
  }).collect(Collectors.toList());
}
/**
 * Returns a Spark function that, for each (partitionPath, deletePath) tuple in a partition
 * of work, deletes the file and accumulates the outcome into one {@link PartitionCleanStat}
 * per partition path, emitting the accumulated stats when the iterator is exhausted.
 */
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
    HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
    Map<String, PartitionCleanStat> statsByPartition = new HashMap<>();
    FileSystem fs = table.getMetaClient().getFs();
    while (iter.hasNext()) {
      Tuple2<String, String> partitionAndFile = iter.next();
      String partitionPath = partitionAndFile._1();
      String deletePathStr = partitionAndFile._2();
      Boolean deleted = deleteFileAndGetResult(fs, deletePathStr);
      // Lazily create the stat for a partition the first time we see it.
      PartitionCleanStat stat = statsByPartition
          .computeIfAbsent(partitionPath, PartitionCleanStat::new);
      stat.addDeleteFilePatterns(deletePathStr);
      stat.addDeletedFileResult(deletePathStr, deleted);
    }
    return statsByPartition.entrySet().stream()
        .map(entry -> new Tuple2<>(entry.getKey(), entry.getValue()))
        .collect(Collectors.toList()).iterator();
  };
}
/**
 * Returns a Spark function that maps a partition path to the (partitionPath, filePath)
 * pairs that the configured cleaning policy has selected for deletion within it.
 */
private static PairFlatMapFunction<String, String, String> getFilesToDeleteFunc(
    HoodieTable table, HoodieWriteConfig config) {
  return (PairFlatMapFunction<String, String, String>) partitionPathToClean -> {
    HoodieCleanHelper cleanHelper = new HoodieCleanHelper(table, config);
    return cleanHelper.getDeletePaths(partitionPathToClean)
        .stream()
        .map(deletePath -> new Tuple2<>(partitionPathToClean, deletePath.toString()))
        .iterator();
  };
}
/**
 * Deletes a single path (non-recursively) on the given filesystem and reports whether the
 * delete succeeded.
 *
 * @throws IOException if the filesystem call itself fails
 */
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
    throws IOException {
  Path deletePath = new Path(deletePathStr);
  logger.debug("Working on delete path :" + deletePath);
  boolean deleted = fs.delete(deletePath, false);
  if (deleted) {
    logger.debug("Cleaned file at path :" + deletePath);
  }
  return deleted;
}
}

View File

@@ -62,19 +62,12 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
/**
* Implementation of a more real-time read-optimized Hoodie Table where
* <p>
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or)
* Merge with the smallest existing file, to expand it
* </p>
* <p>
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the
* log file into the base file.
* </p>
* <p>
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an
* attempted commit action
* </p>
* Implementation of a more real-time read-optimized Hoodie Table where <p> INSERTS - Same as
* HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
* smallest existing file, to expand it </p> <p> UPDATES - Appends the changes to a rolling log file
* maintained per file Id. Compaction merges the log file into the base file. </p> <p> WARNING - MOR
* table type does not support nested rollbacks, every rollback must be followed by an attempted
* commit action </p>
*/
public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
HoodieCopyOnWriteTable<T> {
@@ -88,57 +81,6 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
super(config, metaClient);
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
.filter(fileSlice -> fileSlice.getLogFiles().count() < 1 &&
fileSlice.getDataFile().get().getFileSize() < config.getParquetSmallFileLimit())
.sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize() ? -1 : 1)
.findFirst();
if(smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
@Override
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
if (profile == null) {
@@ -150,15 +92,16 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
@Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
Iterator<HoodieRecord<T>> recordItr) throws IOException {
logger.info("Merging updates for commit " + commitTime + " for file " + fileId);
if(mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info("Small file corrections for updates for commit " + commitTime + " for file " + fileId);
if (mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
logger.info(
"Small file corrections for updates for commit " + commitTime + " for file " + fileId);
return super.handleUpdate(commitTime, fileId, recordItr);
} else {
HoodieAppendHandle<T> appendHandle =
new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
HoodieAppendHandle<T> appendHandle = new HoodieAppendHandle<>(config, commitTime, this,
fileId, recordItr);
appendHandle.doAppend();
appendHandle.close();
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
@@ -202,13 +145,11 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
if (commits.size() > 1) {
throw new UnsupportedOperationException("Nested Rollbacks are not supported");
}
Map<String, HoodieInstant> commitsAndCompactions =
this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION))
.getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
Map<String, HoodieInstant> commitsAndCompactions = this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
// Atomically un-publish all non-inflight commits
commitsAndCompactions.entrySet().stream().map(entry -> entry.getValue())
@@ -218,9 +159,9 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
Long startTime = System.currentTimeMillis();
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize
(FSUtils.getAllPartitionPaths(this.metaClient.getFs(),
this.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
List<HoodieRollbackStat> allRollbackStats = jsc.parallelize(FSUtils
.getAllPartitionPaths(this.metaClient.getFs(), this.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.map((Function<String, List<HoodieRollbackStat>>) partitionPath -> {
return commits.stream().map(commit -> {
HoodieInstant instant = commitsAndCompactions.get(commit);
@@ -228,56 +169,63 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
switch (instant.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
try {
Map<FileStatus, Boolean> results = super.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
.withDeletedFileResults(results).build();
Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath).withDeletedFileResults(results).build();
break;
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
case HoodieTimeline.DELTA_COMMIT_ACTION:
try {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(this.getCommitTimeline().getInstantDetails(new HoodieInstant(true, instant.getAction(), instant.getTimestamp())).get());
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
this.getCommitTimeline().getInstantDetails(
new HoodieInstant(true, instant.getAction(), instant.getTimestamp()))
.get());
// read commit file and (either append delete blocks or delete file)
Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
Map<FileStatus, Long> filesToNumBlocksRollback = new HashMap<>();
// we do not know fileIds for inserts (first inserts are parquet files), delete all parquet files for the corresponding failed commit, if present (same as COW)
// we do not know fileIds for inserts (first inserts are parquet files), delete
// all parquet files for the corresponding failed commit, if present (same as COW)
filesToDeletedStatus = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));
// append rollback blocks for updates
if(commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
commitMetadata.getPartitionToWriteStats().get(partitionPath).stream()
.filter(wStat -> {
return wStat != null && wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
return wStat != null
&& wStat.getPrevCommit() != HoodieWriteStat.NULL_COMMIT
&& wStat.getPrevCommit() != null;
})
.forEach(wStat -> {
}).forEach(wStat -> {
HoodieLogFormat.Writer writer = null;
try {
writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
writer = HoodieLogFormat.newWriterBuilder().onParentPath(
new Path(this.getMetaClient().getBasePath(), partitionPath))
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
.withFs(this.metaClient.getFs())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
Long numRollbackBlocks = 0L;
// generate metadata
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
Map<HoodieLogBlock.HeaderMetadataType, String> header =
Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME,
metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, commit);
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME,
commit);
header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String
.valueOf(
HoodieCommandBlock.HoodieCommandBlockTypeEnum
.ROLLBACK_PREVIOUS_BLOCK
.ordinal()));
// if update belongs to an existing log file
writer = writer.appendBlock(new HoodieCommandBlock(
header));
writer = writer.appendBlock(new HoodieCommandBlock(header));
numRollbackBlocks++;
filesToNumBlocksRollback
.put(this.getMetaClient().getFs().getFileStatus(writer.getLogFile().getPath()),
numRollbackBlocks);
filesToNumBlocksRollback.put(this.getMetaClient().getFs()
.getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks);
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException(
"Failed to rollback for commit " + commit, io);
@@ -289,7 +237,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
});
hoodieRollbackStats = HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
hoodieRollbackStats = HoodieRollbackStat.newBuilder()
.withPartitionPath(partitionPath)
.withDeletedFileResults(filesToDeletedStatus)
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build();
}
@@ -297,17 +246,19 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
} catch (IOException io) {
throw new UncheckedIOException("Failed to rollback for commit " + commit, io);
}
default:
break;
}
return hoodieRollbackStats;
}).collect(Collectors.toList());
}).flatMap(x -> x.iterator()).filter(x -> x != null).collect();
commitsAndCompactions.entrySet().stream()
.map(entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp()))
.forEach(this.getActiveTimeline()::deleteInflight);
commitsAndCompactions.entrySet().stream().map(
entry -> new HoodieInstant(true, entry.getValue().getAction(),
entry.getValue().getTimestamp())).forEach(this.getActiveTimeline()::deleteInflight);
logger.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
logger
.debug("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime));
return allRollbackStats;
}
@@ -317,4 +268,56 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
// do nothing for MOR tables
return Optional.empty();
}
/**
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet
* files to larger ones without the need for an index in the logFile.
*/
class MergeOnReadUpsertPartitioner extends HoodieCopyOnWriteTable.UpsertPartitioner {
MergeOnReadUpsertPartitioner(WorkloadProfile profile) {
super(profile);
}
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
// smallFiles only for partitionPath
List<SmallFile> smallFileLocations = new ArrayList<>();
// Init here since this class (and member variables) might not have been initialized
HoodieTimeline commitTimeline = getCompletedCommitTimeline();
if (!commitTimeline.empty()) {
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
// find smallest file in partition and append to it
Optional<FileSlice> smallFileSlice = getRTFileSystemView()
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).filter(
fileSlice -> fileSlice.getLogFiles().count() < 1
&& fileSlice.getDataFile().get().getFileSize() < config
.getParquetSmallFileLimit()).sorted((FileSlice left, FileSlice right) ->
left.getDataFile().get().getFileSize() < right.getDataFile().get().getFileSize()
? -1 : 1).findFirst();
if (smallFileSlice.isPresent()) {
String filename = smallFileSlice.get().getDataFile().get().getFileName();
SmallFile sf = new SmallFile();
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename));
sf.sizeBytes = smallFileSlice.get().getDataFile().get().getFileSize();
smallFileLocations.add(sf);
// Update the global small files list
smallFiles.add(sf);
}
}
return smallFileLocations;
}
public List<String> getSmallFileIds() {
return (List<String>) smallFiles.stream()
.map(smallFile -> ((SmallFile) smallFile).location.getFileId())
.collect(Collectors.toList());
}
}
}

View File

@@ -60,18 +60,28 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
this.metaClient = metaClient;
}
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Provides a partitioner to perform the upsert operation, based on the workload profile
*/
public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile);
/**
* Provides a partitioner to perform the insert operation, based on the workload profile
*/
public abstract Partitioner getInsertPartitioner(WorkloadProfile profile);
/**
* Return whether this HoodieTable implementation can benefit from workload profiling
*/
@@ -131,7 +141,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getCommitsTimeline().filterInflights();
}
/**
* Get only the completed (no-inflights) clean timeline
*/
@@ -162,12 +171,12 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTime + ". No such savepoint.");
}
HoodieInstant instant =
new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION,
savepointTime);
HoodieSavepointMetadata metadata = null;
try {
metadata = AvroUtils.deserializeHoodieSavepointMetadata(
getActiveTimeline().getInstantDetails(instant).get());
metadata = AvroUtils
.deserializeHoodieSavepointMetadata(getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
throw new HoodieSavepointException(
"Could not get savepointed data files for savepoint " + savepointTime, e);
@@ -189,7 +198,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return getActiveTimeline().getCommitTimeline();
case MERGE_ON_READ:
// We need to include the parquet files written out in delta commits
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
// Include commit action to be able to start doing a MOR over a COW dataset - no
// migration required
return getActiveTimeline().getCommitsTimeline();
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
@@ -219,9 +229,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return HoodieActiveTimeline.COMMIT_ACTION;
case MERGE_ON_READ:
return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
default:
throw new HoodieCommitException(
"Could not commit on unknown storage type " + metaClient.getTableType());
}
throw new HoodieCommitException(
"Could not commit on unknown storage type " + metaClient.getTableType());
}
/**
@@ -236,21 +247,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
switch (metaClient.getTableType()) {
case COPY_ON_WRITE:
return new HoodieCopyOnWriteTable<>(config, metaClient);
case MERGE_ON_READ:
return new HoodieMergeOnReadTable<>(config, metaClient);
default:
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
}
}
/**
* Run Compaction on the table.
* Compaction arranges the data so that it is optimized for data access
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
*/
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);

View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
@@ -31,7 +30,7 @@ import scala.Tuple2;
/**
* Information about incoming records for upsert/insert obtained either via sampling or
* introspecting the data fully
*
* <p>
* TODO(vc): Think about obtaining this directly from index.tagLocation
*/
public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializable {
@@ -60,11 +59,9 @@ public class WorkloadProfile<T extends HoodieRecordPayload> implements Serializa
private void buildProfile() {
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
.mapToPair(record ->
new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record))
.countByKey();
.mapToPair(record -> new Tuple2<>(
new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())),
record)).countByKey();
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts
.entrySet()) {

View File

@@ -17,10 +17,9 @@
package com.uber.hoodie.table;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import org.apache.commons.lang3.tuple.Pair;
import java.io.Serializable;
import java.util.HashMap;
import org.apache.commons.lang3.tuple.Pair;
/**
* Wraps stats about a single partition path.