1
0

(1) Define CompactionWorkload in Avro to allow storing it in instant files.

(2) Split APIs in HoodieRealtimeCompactor to separate generating compaction workload from running compaction
This commit is contained in:
Balaji Varadarajan
2018-05-23 20:49:24 -07:00
committed by vinoth chandar
parent 6d01ae8ca0
commit 1b61f04e05
16 changed files with 444 additions and 129 deletions

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.table;
import com.google.common.collect.Maps;
import com.google.common.hash.Hashing;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieRollbackStat;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
@@ -161,11 +162,22 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
return true;
}
/**
 * Scheduling a compaction is not applicable to a copy-on-write table; this
 * method unconditionally rejects the request.
 *
 * @param jsc        Spark context (unused)
 * @param commitTime instant time the caller attempted to schedule at (unused)
 * @throws HoodieNotSupportedException always
 */
@Override
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String commitTime) {
  final String message = "Compaction is not supported from a CopyOnWrite table";
  throw new HoodieNotSupportedException(message);
}
/**
 * Compaction is not applicable to a copy-on-write table; this method
 * unconditionally rejects the request.
 *
 * @param jsc        Spark context (unused)
 * @param commitTime instant time the caller attempted to compact at (unused)
 * @throws HoodieNotSupportedException always
 */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
  final String message = "Compaction is not supported from a CopyOnWrite table";
  throw new HoodieNotSupportedException(message);
}
/**
 * Plan-based compaction is not applicable to a copy-on-write table; this
 * method unconditionally rejects the request.
 *
 * @param jsc                   Spark context (unused)
 * @param compactionInstantTime instant time of the attempted compaction (unused)
 * @param compactionPlan        plan describing the attempted compaction (unused)
 * @throws HoodieNotSupportedException always
 */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
    HoodieCompactionPlan compactionPlan) {
  final String message = "Compaction is not supported from a CopyOnWrite table";
  throw new HoodieNotSupportedException(message);
}
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
// these are updates

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.table;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.HoodieRollbackStat;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
@@ -126,7 +127,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionCommitTime) {
public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
Optional<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline()
.filterCompletedInstants().lastInstant();
@@ -141,10 +142,20 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
+ " delta commits was found since last compaction " + deltaCommitsSinceTs
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax());
return jsc.emptyRDD();
return new HoodieCompactionPlan();
}
logger.info("Compacting merge on read table " + config.getBasePath());
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
try {
return compactor.generateCompactionPlan(jsc, this, config, instantTime);
} catch (IOException e) {
throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
}
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionCommitTime) {
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
try {
return compactor.compact(jsc, config, this, compactionCommitTime);
@@ -153,6 +164,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
}
/**
 * Executes the supplied compaction plan against this merge-on-read table by
 * delegating to {@link HoodieRealtimeTableCompactor}.
 *
 * @param jsc                   Spark context
 * @param compactionInstantTime instant time the compaction runs under
 * @param compactionPlan        plan describing the compaction work to execute
 * @return RDD of write statuses produced by the compaction run
 * @throws HoodieCompactionException if the underlying compaction fails with an I/O error
 */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
    HoodieCompactionPlan compactionPlan) {
  final HoodieRealtimeTableCompactor realtimeCompactor = new HoodieRealtimeTableCompactor();
  try {
    return realtimeCompactor.compact(jsc, compactionPlan, this, config, compactionInstantTime);
  } catch (IOException e) {
    // Wrap the checked IOException in the domain runtime exception, preserving the cause.
    throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
  }
}
@Override
public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
throws IOException {

View File

@@ -17,6 +17,7 @@
package com.uber.hoodie.table;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieRollbackStat;
@@ -211,12 +212,32 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);
/**
 * Schedule compaction for the given instant time.
 *
 * @param jsc Spark Context
 * @param instantTime Instant Time for scheduling compaction
 * @return the generated compaction plan; callers should check it for emptiness
 *         before attempting to execute it — TODO confirm empty-plan convention with callers
 */
public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);
/**
 * Run Compaction on the table. Compaction arranges the data so that it is optimized for data
 * access.
 *
 * @param jsc Spark Context
 * @param commitTime Instant Time to run the compaction under
 * @return RDD of write statuses produced by the compaction
 * @deprecated Will be replaced with newer APIs; use
 *             {@code scheduleCompaction(JavaSparkContext, String)} followed by
 *             {@code compact(JavaSparkContext, String, HoodieCompactionPlan)} instead
 */
@Deprecated
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);
/**
 * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access.
 *
 * @param jsc Spark Context
 * @param compactionInstantTime Instant Time the compaction runs under
 * @param compactionPlan Compaction Plan (previously produced by scheduleCompaction) to execute
 * @return RDD of write statuses produced by the compaction
 */
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
HoodieCompactionPlan compactionPlan);
/**
* Clean partition paths according to cleaning policy and returns the number of files cleaned.
*/