Fixes needed to run merge-on-read testing on production scale data
This commit is contained in:
committed by
prazanna
parent
57ab7a2405
commit
aee136777b
@@ -16,6 +16,7 @@
|
||||
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
@@ -32,6 +33,7 @@ import com.uber.hoodie.exception.HoodieUpsertException;
|
||||
import com.uber.hoodie.func.LazyInsertIterable;
|
||||
import com.uber.hoodie.io.HoodieUpdateHandle;
|
||||
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -55,6 +57,7 @@ import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import scala.Option;
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -470,4 +473,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
Partitioner partitioner) {
|
||||
return handleUpsertPartition(commitTime, partition, recordItr, partitioner);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc) {
|
||||
logger.info("Nothing to compact in COW storage format");
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,11 +17,17 @@
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCompactionException;
|
||||
import com.uber.hoodie.io.HoodieAppendHandle;
|
||||
import com.uber.hoodie.io.compact.CompactionFilter;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import java.util.Optional;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -29,6 +35,7 @@ import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Implementation of a more real-time read-optimized Hoodie Table where
|
||||
@@ -59,4 +66,33 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc) {
|
||||
logger.info("Checking if compaction needs to be run on " + config.getBasePath());
|
||||
Optional<HoodieInstant> lastCompaction = getActiveTimeline().getCompactionTimeline()
|
||||
.filterCompletedInstants().lastInstant();
|
||||
String deltaCommitsSinceTs = "0";
|
||||
if (lastCompaction.isPresent()) {
|
||||
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
|
||||
}
|
||||
|
||||
int deltaCommitsSinceLastCompaction = getActiveTimeline().getDeltaCommitTimeline()
|
||||
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
|
||||
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
|
||||
logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
|
||||
+ " delta commits was found since last compaction " + deltaCommitsSinceTs
|
||||
+ ". Waiting for " + config.getInlineCompactDeltaCommitMax());
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
logger.info("Compacting merge on read table " + config.getBasePath());
|
||||
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
|
||||
try {
|
||||
return Optional.of(compactor.compact(jsc, config, this, CompactionFilter.allowAll()));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ package com.uber.hoodie.table;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
|
||||
import com.uber.hoodie.avro.model.HoodieSavepointPartitionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.TableFileSystemView;
|
||||
@@ -34,6 +34,7 @@ import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
|
||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||
import java.util.Optional;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.Partitioner;
|
||||
|
||||
@@ -41,9 +42,9 @@ import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Abstract implementation of a HoodieTable
|
||||
@@ -195,8 +196,9 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
return getActiveTimeline().getCommitTimeline();
|
||||
case MERGE_ON_READ:
|
||||
// We need to include the parquet files written out in delta commits
|
||||
// Include commit action to be able to start doing a MOR over a COW dataset - no migration required
|
||||
return getActiveTimeline().getTimelineOfActions(
|
||||
Sets.newHashSet(HoodieActiveTimeline.COMPACTION_ACTION,
|
||||
Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION,
|
||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION));
|
||||
default:
|
||||
throw new HoodieException("Unsupported table type :"+ metaClient.getTableType());
|
||||
@@ -293,4 +295,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run Compaction on the table.
|
||||
* Compaction arranges the data so that it is optimized for data access
|
||||
*/
|
||||
public abstract Optional<HoodieCompactionMetadata> compact(JavaSparkContext jsc);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user