Introduce HoodieReadHandle abstraction into index
- Generalized BloomIndex to work with file ids instead of paths
- Abstracted away Bloom filter checking into HoodieKeyLookupHandle
- Abstracted away range information retrieval into HoodieRangeInfoHandle
This commit is contained in:
committed by
vinoth chandar
parent
51d122b5c3
commit
b791473a6d
@@ -20,6 +20,7 @@ package com.uber.hoodie;
|
|||||||
|
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
|
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
|
||||||
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
@@ -119,15 +120,27 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Optional<String> convertToDataFilePath(Optional<Pair<String, String>> partitionPathFileIDPair) {
|
||||||
|
if (partitionPathFileIDPair.isPresent()) {
|
||||||
|
HoodieDataFile dataFile = hoodieTable.getROFileSystemView()
|
||||||
|
.getLatestDataFile(partitionPathFileIDPair.get().getLeft(), partitionPathFileIDPair.get().getRight()).get();
|
||||||
|
return Optional.of(dataFile.getPath());
|
||||||
|
} else {
|
||||||
|
return Optional.absent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a bunch of hoodie keys, fetches all the individual records out as a data frame
|
* Given a bunch of hoodie keys, fetches all the individual records out as a data frame
|
||||||
*
|
*
|
||||||
* @return a dataframe
|
* @return a dataframe
|
||||||
*/
|
*/
|
||||||
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
|
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
|
||||||
assertSqlContext();
|
assertSqlContext();
|
||||||
JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = index
|
JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> lookupResultRDD = index
|
||||||
.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
|
||||||
|
JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = lookupResultRDD
|
||||||
|
.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
|
||||||
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
||||||
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
||||||
|
|
||||||
@@ -144,7 +157,6 @@ public class HoodieReadClient<T extends HoodieRecordPayload> extends AbstractHoo
|
|||||||
|
|
||||||
// Now, we need to further filter out, for only rows that match the supplied hoodie keys
|
// Now, we need to further filter out, for only rows that match the supplied hoodie keys
|
||||||
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
|
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
|
||||||
|
|
||||||
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
|
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ import com.uber.hoodie.common.util.queue.BoundedInMemoryQueueConsumer;
|
|||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||||
import com.uber.hoodie.io.HoodieIOHandle;
|
import com.uber.hoodie.io.HoodieWriteHandle;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@@ -131,12 +131,11 @@ public class CopyOnWriteLazyInsertIterable<T extends HoodieRecordPayload> extend
|
|||||||
BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
|
BoundedInMemoryQueueConsumer<HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> {
|
||||||
|
|
||||||
protected final List<WriteStatus> statuses = new ArrayList<>();
|
protected final List<WriteStatus> statuses = new ArrayList<>();
|
||||||
protected HoodieIOHandle handle;
|
protected HoodieWriteHandle handle;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void consumeOneRecord(HoodieInsertValueGenResult<HoodieRecord> payload) {
|
protected void consumeOneRecord(HoodieInsertValueGenResult<HoodieRecord> payload) {
|
||||||
final HoodieRecord insertPayload = payload.record;
|
final HoodieRecord insertPayload = payload.record;
|
||||||
|
|
||||||
// lazily initialize the handle, for the first time
|
// lazily initialize the handle, for the first time
|
||||||
if (handle == null) {
|
if (handle == null) {
|
||||||
handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, insertPayload.getPartitionPath(),
|
handle = new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, insertPayload.getPartitionPath(),
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import com.uber.hoodie.WriteStatus;
|
|||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||||
@@ -63,12 +64,10 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
|
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[partitionPath, fileID]]
|
||||||
* If the optional FullFilePath value is not present, then the key is not found. If the
|
* If the optional is empty, then the key is not found.
|
||||||
* FullFilePath value is present, it is the path component (without scheme) of the URI underlying
|
|
||||||
* file
|
|
||||||
*/
|
*/
|
||||||
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
|
public abstract JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> fetchRecordLocation(
|
||||||
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
|
JavaRDD<HoodieKey> hoodieKeys, final JavaSparkContext jsc, HoodieTable<T> hoodieTable);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import com.uber.hoodie.common.model.HoodieKey;
|
|||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@@ -55,7 +56,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
public JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
||||||
throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
|
throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,30 +22,30 @@ import com.google.common.base.Objects;
|
|||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Metadata about a given file, useful for index lookup
|
* Metadata about a given file group, useful for index lookup
|
||||||
*/
|
*/
|
||||||
public class BloomIndexFileInfo implements Serializable {
|
public class BloomIndexFileInfo implements Serializable {
|
||||||
|
|
||||||
private final String fileName;
|
private final String fileId;
|
||||||
|
|
||||||
private final String minRecordKey;
|
private final String minRecordKey;
|
||||||
|
|
||||||
private final String maxRecordKey;
|
private final String maxRecordKey;
|
||||||
|
|
||||||
public BloomIndexFileInfo(String fileName, String minRecordKey, String maxRecordKey) {
|
public BloomIndexFileInfo(String fileId, String minRecordKey, String maxRecordKey) {
|
||||||
this.fileName = fileName;
|
this.fileId = fileId;
|
||||||
this.minRecordKey = minRecordKey;
|
this.minRecordKey = minRecordKey;
|
||||||
this.maxRecordKey = maxRecordKey;
|
this.maxRecordKey = maxRecordKey;
|
||||||
}
|
}
|
||||||
|
|
||||||
public BloomIndexFileInfo(String fileName) {
|
public BloomIndexFileInfo(String fileId) {
|
||||||
this.fileName = fileName;
|
this.fileId = fileId;
|
||||||
this.minRecordKey = null;
|
this.minRecordKey = null;
|
||||||
this.maxRecordKey = null;
|
this.maxRecordKey = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getFileName() {
|
public String getFileId() {
|
||||||
return fileName;
|
return fileId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getMinRecordKey() {
|
public String getMinRecordKey() {
|
||||||
@@ -77,19 +77,19 @@ public class BloomIndexFileInfo implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
BloomIndexFileInfo that = (BloomIndexFileInfo) o;
|
||||||
return Objects.equal(that.fileName, fileName) && Objects.equal(that.minRecordKey, minRecordKey)
|
return Objects.equal(that.fileId, fileId) && Objects.equal(that.minRecordKey, minRecordKey)
|
||||||
&& Objects.equal(that.maxRecordKey, maxRecordKey);
|
&& Objects.equal(that.maxRecordKey, maxRecordKey);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hashCode(fileName, minRecordKey, maxRecordKey);
|
return Objects.hashCode(fileId, minRecordKey, maxRecordKey);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
final StringBuilder sb = new StringBuilder("BloomIndexFileInfo {");
|
final StringBuilder sb = new StringBuilder("BloomIndexFileInfo {");
|
||||||
sb.append(" fileName=").append(fileName);
|
sb.append(" fileId=").append(fileId);
|
||||||
sb.append(" minRecordKey=").append(minRecordKey);
|
sb.append(" minRecordKey=").append(minRecordKey);
|
||||||
sb.append(" maxRecordKey=").append(maxRecordKey);
|
sb.append(" maxRecordKey=").append(maxRecordKey);
|
||||||
sb.append('}');
|
sb.append('}');
|
||||||
|
|||||||
@@ -25,26 +25,22 @@ import static java.util.stream.Collectors.toList;
|
|||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Optional;
|
import com.google.common.base.Optional;
|
||||||
import com.uber.hoodie.WriteStatus;
|
import com.uber.hoodie.WriteStatus;
|
||||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
|
||||||
import com.uber.hoodie.common.util.ParquetUtils;
|
|
||||||
import com.uber.hoodie.common.util.collection.Pair;
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.MetadataNotFoundException;
|
import com.uber.hoodie.exception.MetadataNotFoundException;
|
||||||
import com.uber.hoodie.index.HoodieIndex;
|
import com.uber.hoodie.index.HoodieIndex;
|
||||||
|
import com.uber.hoodie.io.HoodieRangeInfoHandle;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.Partitioner;
|
import org.apache.spark.Partitioner;
|
||||||
@@ -85,7 +81,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<HoodieKey, String> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
|
||||||
|
hoodieTable);
|
||||||
|
|
||||||
// Cache the result, for subsequent stages.
|
// Cache the result, for subsequent stages.
|
||||||
if (config.getBloomIndexUseCaching()) {
|
if (config.getBloomIndexUseCaching()) {
|
||||||
@@ -109,27 +106,33 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
return taggedRecordRDD;
|
return taggedRecordRDD;
|
||||||
}
|
}
|
||||||
|
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
/**
|
||||||
|
* Returns an RDD mapping each HoodieKey with a partitionPath/fileID which contains it. Optional.Empty if the key is
|
||||||
|
* not found.
|
||||||
|
*
|
||||||
|
* @param hoodieKeys keys to lookup
|
||||||
|
* @param jsc spark context
|
||||||
|
* @param hoodieTable hoodie table object
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
|
||||||
.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
|
||||||
|
|
||||||
// Lookup indexes for all the partition/recordkey pair
|
// Lookup indexes for all the partition/recordkey pair
|
||||||
JavaPairRDD<HoodieKey, String> keyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD = lookupIndex(partitionRecordKeyPairRDD, jsc,
|
||||||
|
hoodieTable);
|
||||||
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
|
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
|
||||||
|
|
||||||
return keyHoodieKeyPairRDD.leftOuterJoin(keyFilenamePairRDD).mapToPair(keyPathTuple -> {
|
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
|
||||||
Optional<String> recordLocationPath;
|
Optional<Pair<String, String>> partitionPathFileidPair;
|
||||||
if (keyPathTuple._2._2.isPresent()) {
|
if (keyLoc._2._2.isPresent()) {
|
||||||
String fileName = keyPathTuple._2._2.get();
|
partitionPathFileidPair = Optional.of(Pair.of(keyLoc._1().getPartitionPath(), keyLoc._2._2.get().getFileId()));
|
||||||
String partitionPath = keyPathTuple._1.getPartitionPath();
|
|
||||||
recordLocationPath = Optional
|
|
||||||
.of(new Path(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath), fileName)
|
|
||||||
.toUri().getPath());
|
|
||||||
} else {
|
} else {
|
||||||
recordLocationPath = Optional.absent();
|
partitionPathFileidPair = Optional.absent();
|
||||||
}
|
}
|
||||||
return new Tuple2<>(keyPathTuple._1, recordLocationPath);
|
return new Tuple2<>(keyLoc._1, partitionPathFileidPair);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,9 +140,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* Lookup the location for each record key and return the pair<record_key,location> for all record keys already
|
* Lookup the location for each record key and return the pair<record_key,location> for all record keys already
|
||||||
* present and drop the record keys if not present
|
* present and drop the record keys if not present
|
||||||
*/
|
*/
|
||||||
private JavaPairRDD<HoodieKey, String> lookupIndex(
|
private JavaPairRDD<HoodieKey, HoodieRecordLocation> lookupIndex(
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc,
|
||||||
jsc, final HoodieTable hoodieTable) {
|
final HoodieTable hoodieTable) {
|
||||||
// Obtain records per partition, in the incoming records
|
// Obtain records per partition, in the incoming records
|
||||||
Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
|
Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
|
||||||
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
||||||
@@ -157,7 +160,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
|
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
|
||||||
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
|
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
|
||||||
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
|
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism,
|
||||||
hoodieTable.getMetaClient(), comparisonsPerFileGroup);
|
hoodieTable, comparisonsPerFileGroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -178,7 +181,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
partitionToFileInfo.entrySet().stream().forEach(e -> {
|
||||||
for (BloomIndexFileInfo fileInfo : e.getValue()) {
|
for (BloomIndexFileInfo fileInfo : e.getValue()) {
|
||||||
//each file needs to be compared against all the records coming into the partition
|
//each file needs to be compared against all the records coming into the partition
|
||||||
fileToComparisons.put(fileInfo.getFileName(), recordsPerPartition.get(e.getKey()));
|
fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(e.getKey()));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -227,35 +230,35 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
final HoodieTable hoodieTable) {
|
final HoodieTable hoodieTable) {
|
||||||
|
|
||||||
// Obtain the latest data files from all the partitions.
|
// Obtain the latest data files from all the partitions.
|
||||||
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
|
List<Pair<String, String>> partitionPathFileIDList = jsc
|
||||||
.parallelize(partitions, Math.max(partitions.size(), 1)).flatMapToPair(partitionPath -> {
|
.parallelize(partitions, Math.max(partitions.size(), 1))
|
||||||
|
.flatMap(partitionPath -> {
|
||||||
java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
|
java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
|
||||||
.filterCompletedInstants().lastInstant();
|
.filterCompletedInstants().lastInstant();
|
||||||
List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
|
List<Pair<String, String>> filteredFiles = new ArrayList<>();
|
||||||
if (latestCommitTime.isPresent()) {
|
if (latestCommitTime.isPresent()) {
|
||||||
filteredFiles = hoodieTable.getROFileSystemView()
|
filteredFiles = hoodieTable.getROFileSystemView()
|
||||||
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
|
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
|
||||||
.map(f -> new Tuple2<>(partitionPath, f)).collect(toList());
|
.map(f -> Pair.of(partitionPath, f.getFileId())).collect(toList());
|
||||||
}
|
}
|
||||||
return filteredFiles.iterator();
|
return filteredFiles.iterator();
|
||||||
}).collect();
|
}).collect();
|
||||||
|
|
||||||
if (config.getBloomIndexPruneByRanges()) {
|
if (config.getBloomIndexPruneByRanges()) {
|
||||||
// also obtain file ranges, if range pruning is enabled
|
// also obtain file ranges, if range pruning is enabled
|
||||||
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)).mapToPair(ft -> {
|
return jsc.parallelize(partitionPathFileIDList, Math.max(partitionPathFileIDList.size(), 1)).mapToPair(pf -> {
|
||||||
try {
|
try {
|
||||||
String[] minMaxKeys = ParquetUtils
|
HoodieRangeInfoHandle<T> rangeInfoHandle = new HoodieRangeInfoHandle<T>(config, hoodieTable, pf);
|
||||||
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), new Path(ft._2().getPath()));
|
String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
|
||||||
return new Tuple2<>(ft._1(),
|
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
|
||||||
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
|
|
||||||
} catch (MetadataNotFoundException me) {
|
} catch (MetadataNotFoundException me) {
|
||||||
logger.warn("Unable to find range metadata in file :" + ft._2());
|
logger.warn("Unable to find range metadata in file :" + pf);
|
||||||
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
|
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
|
||||||
}
|
}
|
||||||
}).collect();
|
}).collect();
|
||||||
} else {
|
} else {
|
||||||
return dataFilesList.stream()
|
return partitionPathFileIDList.stream()
|
||||||
.map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())))
|
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue())))
|
||||||
.collect(toList());
|
.collect(toList());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -324,9 +327,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* parallelism for tagging location
|
* parallelism for tagging location
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
JavaPairRDD<HoodieKey, String> findMatchingFilesForRecordKeys(
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
|
||||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTableMetaClient metaClient,
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
|
||||||
Map<String, Long> fileGroupToComparisons) {
|
Map<String, Long> fileGroupToComparisons) {
|
||||||
JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
|
JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
|
||||||
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
||||||
@@ -347,17 +350,18 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
}
|
}
|
||||||
|
|
||||||
return fileComparisonsRDD
|
return fileComparisonsRDD
|
||||||
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(metaClient, config.getBasePath()), true)
|
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
|
||||||
.flatMap(List::iterator)
|
.flatMap(List::iterator)
|
||||||
.filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
.filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
||||||
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
||||||
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
|
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
|
||||||
lookupResult.getFileName()))
|
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
|
||||||
.collect(Collectors.toList())
|
.collect(Collectors.toList())
|
||||||
.iterator());
|
.iterator());
|
||||||
}
|
}
|
||||||
|
|
||||||
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord, org.apache.spark.api.java.Optional<String> location) {
|
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord,
|
||||||
|
org.apache.spark.api.java.Optional<HoodieRecordLocation> location) {
|
||||||
HoodieRecord<T> record = inputRecord;
|
HoodieRecord<T> record = inputRecord;
|
||||||
if (location.isPresent()) {
|
if (location.isPresent()) {
|
||||||
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
|
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
|
||||||
@@ -366,11 +370,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
// currentLocation 2 times and it will fail the second time. So creating a new in memory
|
// currentLocation 2 times and it will fail the second time. So creating a new in memory
|
||||||
// copy of the hoodie record.
|
// copy of the hoodie record.
|
||||||
record = new HoodieRecord<>(inputRecord);
|
record = new HoodieRecord<>(inputRecord);
|
||||||
String filename = location.get();
|
record.setCurrentLocation(location.get());
|
||||||
if (filename != null && !filename.isEmpty()) {
|
|
||||||
record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
|
|
||||||
FSUtils.getFileId(filename)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
@@ -379,10 +379,9 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
||||||
*/
|
*/
|
||||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
JavaPairRDD<HoodieKey, String> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD = recordRDD
|
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD = recordRDD
|
||||||
.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
||||||
|
|
||||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
||||||
// so we do left outer join.
|
// so we do left outer join.
|
||||||
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map(v1 -> getTaggedRecord(v1._1, v1._2));
|
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values().map(v1 -> getTaggedRecord(v1._1, v1._2));
|
||||||
|
|||||||
@@ -18,23 +18,18 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index.bloom;
|
package com.uber.hoodie.index.bloom;
|
||||||
|
|
||||||
import com.uber.hoodie.common.BloomFilter;
|
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.common.util.HoodieTimer;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.common.util.ParquetUtils;
|
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
import com.uber.hoodie.exception.HoodieIndexException;
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
import com.uber.hoodie.func.LazyIterableIterator;
|
import com.uber.hoodie.func.LazyIterableIterator;
|
||||||
|
import com.uber.hoodie.io.HoodieKeyLookupHandle;
|
||||||
|
import com.uber.hoodie.io.HoodieKeyLookupHandle.KeyLookupResult;
|
||||||
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
@@ -43,150 +38,69 @@ import scala.Tuple2;
|
|||||||
* actual files
|
* actual files
|
||||||
*/
|
*/
|
||||||
public class HoodieBloomIndexCheckFunction implements
|
public class HoodieBloomIndexCheckFunction implements
|
||||||
Function2<Integer, Iterator<Tuple2<String, HoodieKey>>,
|
Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
|
||||||
Iterator<List<KeyLookupResult>>> {
|
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
|
private final HoodieTable hoodieTable;
|
||||||
|
|
||||||
private final String basePath;
|
private final HoodieWriteConfig config;
|
||||||
|
|
||||||
private final HoodieTableMetaClient metaClient;
|
public HoodieBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteConfig config) {
|
||||||
|
this.hoodieTable = hoodieTable;
|
||||||
public HoodieBloomIndexCheckFunction(HoodieTableMetaClient metaClient, String basePath) {
|
this.config = config;
|
||||||
this.metaClient = metaClient;
|
|
||||||
this.basePath = basePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a list of row keys and one file, return only row keys existing in that file.
|
|
||||||
*/
|
|
||||||
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
|
|
||||||
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
|
|
||||||
List<String> foundRecordKeys = new ArrayList<>();
|
|
||||||
try {
|
|
||||||
// Load all rowKeys from the file, to double-confirm
|
|
||||||
if (!candidateRecordKeys.isEmpty()) {
|
|
||||||
HoodieTimer timer = new HoodieTimer().startTimer();
|
|
||||||
Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
|
|
||||||
new HashSet<>(candidateRecordKeys));
|
|
||||||
foundRecordKeys.addAll(fileRowKeys);
|
|
||||||
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
|
|
||||||
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
|
|
||||||
if (logger.isDebugEnabled()) {
|
|
||||||
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new HoodieIndexException("Error checking candidate keys against file.", e);
|
|
||||||
}
|
|
||||||
return foundRecordKeys;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<List<KeyLookupResult>> call(Integer partition,
|
public Iterator<List<KeyLookupResult>> call(Integer partition,
|
||||||
Iterator<Tuple2<String, HoodieKey>> fileParitionRecordKeyTripletItr)
|
Iterator<Tuple2<String, HoodieKey>> fileParitionRecordKeyTripletItr) {
|
||||||
throws Exception {
|
|
||||||
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
||||||
}
|
}
|
||||||
|
|
||||||
class LazyKeyCheckIterator extends
|
class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, HoodieKey>, List<KeyLookupResult>> {
|
||||||
LazyIterableIterator<Tuple2<String, HoodieKey>, List<KeyLookupResult>> {
|
|
||||||
|
|
||||||
private List<String> candidateRecordKeys;
|
private HoodieKeyLookupHandle keyLookupHandle;
|
||||||
|
|
||||||
private BloomFilter bloomFilter;
|
|
||||||
|
|
||||||
private String currentFile;
|
|
||||||
|
|
||||||
private String currentPartitionPath;
|
|
||||||
|
|
||||||
private long totalKeysChecked;
|
|
||||||
|
|
||||||
LazyKeyCheckIterator(
|
LazyKeyCheckIterator(
|
||||||
Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
||||||
super(filePartitionRecordKeyTripletItr);
|
super(filePartitionRecordKeyTripletItr);
|
||||||
currentFile = null;
|
|
||||||
candidateRecordKeys = new ArrayList<>();
|
|
||||||
bloomFilter = null;
|
|
||||||
currentPartitionPath = null;
|
|
||||||
totalKeysChecked = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void start() {
|
protected void start() {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initState(String fileName, String partitionPath) throws HoodieIndexException {
|
|
||||||
try {
|
|
||||||
Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
|
|
||||||
HoodieTimer timer = new HoodieTimer().startTimer();
|
|
||||||
bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(metaClient.getHadoopConf(), filePath);
|
|
||||||
logger.info(String.format("Read bloom filter from %s/%s in %d ms", partitionPath, fileName, timer.endTimer()));
|
|
||||||
candidateRecordKeys = new ArrayList<>();
|
|
||||||
currentFile = fileName;
|
|
||||||
currentPartitionPath = partitionPath;
|
|
||||||
totalKeysChecked = 0;
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new HoodieIndexException("Error checking candidate keys against file.", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// check record key against bloom filter of current file & add to possible keys if needed
|
|
||||||
private void checkAndAddCandidates(String recordKey) {
|
|
||||||
if (bloomFilter.mightContain(recordKey)) {
|
|
||||||
if (logger.isDebugEnabled()) {
|
|
||||||
logger.debug("Record key " + recordKey + " matches bloom filter in file " + currentPartitionPath
|
|
||||||
+ "/" + currentFile);
|
|
||||||
}
|
|
||||||
candidateRecordKeys.add(recordKey);
|
|
||||||
}
|
|
||||||
totalKeysChecked++;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> checkAgainstCurrentFile() {
|
|
||||||
Path filePath = new Path(basePath + "/" + currentPartitionPath + "/" + currentFile);
|
|
||||||
if (logger.isDebugEnabled()) {
|
|
||||||
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
|
|
||||||
}
|
|
||||||
List<String> matchingKeys = checkCandidatesAgainstFile(metaClient.getHadoopConf(), candidateRecordKeys, filePath);
|
|
||||||
logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)",
|
|
||||||
totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(),
|
|
||||||
matchingKeys.size()));
|
|
||||||
return matchingKeys;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<KeyLookupResult> computeNext() {
|
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
|
||||||
|
|
||||||
List<KeyLookupResult> ret = new ArrayList<>();
|
List<HoodieKeyLookupHandle.KeyLookupResult> ret = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
// process one file in each go.
|
// process one file in each go.
|
||||||
while (inputItr.hasNext()) {
|
while (inputItr.hasNext()) {
|
||||||
Tuple2<String, HoodieKey> currentTuple = inputItr.next();
|
Tuple2<String, HoodieKey> currentTuple = inputItr.next();
|
||||||
String fileName = currentTuple._1;
|
String fileId = currentTuple._1;
|
||||||
String partitionPath = currentTuple._2.getPartitionPath();
|
String partitionPath = currentTuple._2.getPartitionPath();
|
||||||
String recordKey = currentTuple._2.getRecordKey();
|
String recordKey = currentTuple._2.getRecordKey();
|
||||||
|
Pair<String, String> partitionPathFilePair = Pair.of(partitionPath, fileId);
|
||||||
|
|
||||||
// lazily init state
|
// lazily init state
|
||||||
if (currentFile == null) {
|
if (keyLookupHandle == null) {
|
||||||
initState(fileName, partitionPath);
|
keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair);
|
||||||
}
|
}
|
||||||
|
|
||||||
// if continue on current file
|
// if continue on current file
|
||||||
if (fileName.equals(currentFile)) {
|
if (keyLookupHandle.getPartitionPathFilePair().equals(partitionPathFilePair)) {
|
||||||
checkAndAddCandidates(recordKey);
|
keyLookupHandle.addKey(recordKey);
|
||||||
} else {
|
} else {
|
||||||
// do the actual checking of file & break out
|
// do the actual checking of file & break out
|
||||||
ret.add(new KeyLookupResult(currentFile, currentPartitionPath, checkAgainstCurrentFile()));
|
ret.add(keyLookupHandle.getLookupResult());
|
||||||
initState(fileName, partitionPath);
|
keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair);
|
||||||
checkAndAddCandidates(recordKey);
|
keyLookupHandle.addKey(recordKey);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle case, where we ran out of input, close pending work, update return val
|
// handle case, where we ran out of input, close pending work, update return val
|
||||||
if (!inputItr.hasNext()) {
|
if (!inputItr.hasNext()) {
|
||||||
ret.add(new KeyLookupResult(currentFile, currentPartitionPath, checkAgainstCurrentFile()));
|
ret.add(keyLookupHandle.getLookupResult());
|
||||||
}
|
}
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (e instanceof HoodieException) {
|
if (e instanceof HoodieException) {
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ package com.uber.hoodie.index.bloom;
|
|||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.uber.hoodie.common.model.HoodieKey;
|
import com.uber.hoodie.common.model.HoodieKey;
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
@@ -83,7 +84,7 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||||
Map<String, String> indexToPartitionMap = new HashMap<>();
|
Map<String, String> indexToPartitionMap = new HashMap<>();
|
||||||
for (Entry<String, List<BloomIndexFileInfo>> entry : partitionToFileIndexInfo.entrySet()) {
|
for (Entry<String, List<BloomIndexFileInfo>> entry : partitionToFileIndexInfo.entrySet()) {
|
||||||
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileName(), entry.getKey()));
|
entry.getValue().forEach(indexFile -> indexToPartitionMap.put(indexFile.getFileId(), entry.getKey()));
|
||||||
}
|
}
|
||||||
|
|
||||||
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
|
IndexFileFilter indexFileFilter = config.getBloomIndexPruneByRanges()
|
||||||
@@ -106,7 +107,7 @@ public class HoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||||
JavaPairRDD<HoodieKey, String> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||||
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
|
||||||
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
||||||
|
|
||||||
|
|||||||
@@ -50,9 +50,9 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
|
|||||||
allIndexFiles.forEach(indexFile -> {
|
allIndexFiles.forEach(indexFile -> {
|
||||||
if (indexFile.hasKeyRanges()) {
|
if (indexFile.hasKeyRanges()) {
|
||||||
indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(),
|
indexLookUpTree.insert(new KeyRangeNode(indexFile.getMinRecordKey(),
|
||||||
indexFile.getMaxRecordKey(), indexFile.getFileName()));
|
indexFile.getMaxRecordKey(), indexFile.getFileId()));
|
||||||
} else {
|
} else {
|
||||||
filesWithNoRanges.add(indexFile.getFileName());
|
filesWithNoRanges.add(indexFile.getFileId());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,12 +49,12 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter {
|
|||||||
bloomIndexFiles.forEach(indexFileInfo -> {
|
bloomIndexFiles.forEach(indexFileInfo -> {
|
||||||
if (indexFileInfo.hasKeyRanges()) {
|
if (indexFileInfo.hasKeyRanges()) {
|
||||||
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(),
|
lookUpTree.insert(new KeyRangeNode(indexFileInfo.getMinRecordKey(),
|
||||||
indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileName()));
|
indexFileInfo.getMaxRecordKey(), indexFileInfo.getFileId()));
|
||||||
} else {
|
} else {
|
||||||
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
|
if (!partitionToFilesWithNoRanges.containsKey(partition)) {
|
||||||
partitionToFilesWithNoRanges.put(partition, new HashSet<>());
|
partitionToFilesWithNoRanges.put(partition, new HashSet<>());
|
||||||
}
|
}
|
||||||
partitionToFilesWithNoRanges.get(partition).add(indexFileInfo.getFileName());
|
partitionToFilesWithNoRanges.get(partition).add(indexFileInfo.getFileId());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
partitionToFileIndexLookUpTree.put(partition, lookUpTree);
|
partitionToFileIndexLookUpTree.put(partition, lookUpTree);
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ class ListBasedGlobalIndexFileFilter extends ListBasedIndexFileFilter {
|
|||||||
// for each candidate file in partition, that needs to be compared.
|
// for each candidate file in partition, that needs to be compared.
|
||||||
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
||||||
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
||||||
toReturn.add(indexInfo.getFileName());
|
toReturn.add(indexInfo.getFileId());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class ListBasedIndexFileFilter implements IndexFileFilter {
|
|||||||
// for each candidate file in partition, that needs to be compared.
|
// for each candidate file in partition, that needs to be compared.
|
||||||
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
for (BloomIndexFileInfo indexInfo : indexInfos) {
|
||||||
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
if (shouldCompareWithFile(indexInfo, recordKey)) {
|
||||||
toReturn.add(indexInfo.getFileName());
|
toReturn.add(indexInfo.getFileId());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
|||||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException;
|
||||||
@@ -60,7 +61,6 @@ import org.apache.spark.api.java.JavaPairRDD;
|
|||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -123,9 +123,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
public JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
|
||||||
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
|
||||||
//TODO : Change/Remove filterExists in HoodieReadClient() and revisit
|
|
||||||
throw new UnsupportedOperationException("HBase index does not implement check exist");
|
throw new UnsupportedOperationException("HBase index does not implement check exist");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -297,7 +296,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
|
|||||||
}
|
}
|
||||||
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
|
||||||
Bytes.toBytes(loc.get().getCommitTime()));
|
Bytes.toBytes(loc.get().getInstantTime()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
|
||||||
Bytes.toBytes(loc.get().getFileId()));
|
Bytes.toBytes(loc.get().getFileId()));
|
||||||
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
|
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ import org.apache.spark.util.SizeEstimator;
|
|||||||
/**
|
/**
|
||||||
* IO Operation to append data onto an existing file.
|
* IO Operation to append data onto an existing file.
|
||||||
*/
|
*/
|
||||||
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWriteHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieAppendHandle.class);
|
||||||
// This acts as the sequenceID for records written
|
// This acts as the sequenceID for records written
|
||||||
@@ -114,7 +114,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
RealtimeView rtView = hoodieTable.getRTFileSystemView();
|
RealtimeView rtView = hoodieTable.getRTFileSystemView();
|
||||||
Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
|
Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
|
||||||
// Set the base commit time as the current commitTime for new inserts into log files
|
// Set the base commit time as the current commitTime for new inserts into log files
|
||||||
String baseInstantTime = commitTime;
|
String baseInstantTime = instantTime;
|
||||||
if (fileSlice.isPresent()) {
|
if (fileSlice.isPresent()) {
|
||||||
baseInstantTime = fileSlice.get().getBaseInstantTime();
|
baseInstantTime = fileSlice.get().getBaseInstantTime();
|
||||||
} else {
|
} else {
|
||||||
@@ -134,11 +134,11 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion());
|
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion());
|
||||||
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogOffset(writer.getCurrentSize());
|
((HoodieDeltaWriteStat) writeStatus.getStat()).setLogOffset(writer.getCurrentSize());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Error in update task at commit " + commitTime, e);
|
logger.error("Error in update task at commit " + instantTime, e);
|
||||||
writeStatus.setGlobalError(e);
|
writeStatus.setGlobalError(e);
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException(
|
||||||
"Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
|
"Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
|
||||||
+ commitTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
|
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath()
|
||||||
+ partitionPath, e);
|
+ partitionPath, e);
|
||||||
}
|
}
|
||||||
Path path = new Path(partitionPath, writer.getLogFile().getFileName());
|
Path path = new Path(partitionPath, writer.getLogFile().getFileName());
|
||||||
@@ -154,13 +154,13 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
if (avroRecord.isPresent()) {
|
if (avroRecord.isPresent()) {
|
||||||
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
|
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
|
||||||
avroRecord = Optional.of(rewriteRecord((GenericRecord) avroRecord.get()));
|
avroRecord = Optional.of(rewriteRecord((GenericRecord) avroRecord.get()));
|
||||||
String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
|
String seqId = HoodieRecord.generateSequenceId(instantTime, TaskContext.getPartitionId(),
|
||||||
recordIndex.getAndIncrement());
|
recordIndex.getAndIncrement());
|
||||||
HoodieAvroUtils
|
HoodieAvroUtils
|
||||||
.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
|
.addHoodieKeyToRecord((GenericRecord) avroRecord.get(), hoodieRecord.getRecordKey(),
|
||||||
hoodieRecord.getPartitionPath(), fileId);
|
hoodieRecord.getPartitionPath(), fileId);
|
||||||
HoodieAvroUtils
|
HoodieAvroUtils
|
||||||
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId);
|
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), instantTime, seqId);
|
||||||
// If currentLocation is present, then this is an update
|
// If currentLocation is present, then this is an update
|
||||||
if (hoodieRecord.getCurrentLocation() != null) {
|
if (hoodieRecord.getCurrentLocation() != null) {
|
||||||
updatedRecordsWritten++;
|
updatedRecordsWritten++;
|
||||||
@@ -200,7 +200,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
private void doAppend(Map<HoodieLogBlock.HeaderMetadataType, String> header) {
|
private void doAppend(Map<HoodieLogBlock.HeaderMetadataType, String> header) {
|
||||||
try {
|
try {
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime);
|
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchema.toString());
|
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchema.toString());
|
||||||
if (recordList.size() > 0) {
|
if (recordList.size() > 0) {
|
||||||
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, header));
|
writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, header));
|
||||||
@@ -286,7 +286,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
|
|
||||||
private void writeToBuffer(HoodieRecord<T> record) {
|
private void writeToBuffer(HoodieRecord<T> record) {
|
||||||
// update the new location of the record, so we know where to find it next
|
// update the new location of the record, so we know where to find it next
|
||||||
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
|
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||||
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
|
Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
|
||||||
if (indexedRecord.isPresent()) {
|
if (indexedRecord.isPresent()) {
|
||||||
recordList.add(indexedRecord.get());
|
recordList.add(indexedRecord.get());
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ import org.apache.log4j.LogManager;
|
|||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWriteHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieCreateHandle.class);
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieIOH
|
|||||||
IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
|
IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
|
||||||
storageWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
|
storageWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
|
||||||
// update the new location of record, so we know where to find it next
|
// update the new location of record, so we know where to find it next
|
||||||
record.setNewLocation(new HoodieRecordLocation(commitTime, writeStatus.getFileId()));
|
record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));
|
||||||
recordsWritten++;
|
recordsWritten++;
|
||||||
insertRecordsWritten++;
|
insertRecordsWritten++;
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -18,167 +18,25 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.io;
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
import com.uber.hoodie.WriteStatus;
|
|
||||||
import com.uber.hoodie.common.io.storage.HoodieWrapperFileSystem;
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecord;
|
|
||||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
|
||||||
import com.uber.hoodie.common.util.FailSafeConsistencyGuard;
|
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
|
||||||
import com.uber.hoodie.common.util.HoodieTimer;
|
|
||||||
import com.uber.hoodie.common.util.NoOpConsistencyGuard;
|
|
||||||
import com.uber.hoodie.common.util.ReflectionUtils;
|
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.exception.HoodieException;
|
|
||||||
import com.uber.hoodie.exception.HoodieIOException;
|
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Optional;
|
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.apache.spark.TaskContext;
|
|
||||||
|
|
||||||
|
|
||||||
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieIOHandle.class);
|
protected final String instantTime;
|
||||||
protected final String commitTime;
|
|
||||||
protected final String fileId;
|
|
||||||
protected final String writeToken;
|
|
||||||
protected final HoodieWriteConfig config;
|
protected final HoodieWriteConfig config;
|
||||||
protected final FileSystem fs;
|
protected final FileSystem fs;
|
||||||
protected final HoodieTable<T> hoodieTable;
|
protected final HoodieTable<T> hoodieTable;
|
||||||
protected final Schema originalSchema;
|
|
||||||
protected final Schema writerSchema;
|
|
||||||
protected HoodieTimer timer;
|
|
||||||
protected final WriteStatus writeStatus;
|
|
||||||
|
|
||||||
public HoodieIOHandle(HoodieWriteConfig config, String commitTime, String fileId,
|
HoodieIOHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T> hoodieTable) {
|
||||||
HoodieTable<T> hoodieTable) {
|
this.instantTime = instantTime;
|
||||||
this.commitTime = commitTime;
|
|
||||||
this.fileId = fileId;
|
|
||||||
this.writeToken = makeSparkWriteToken();
|
|
||||||
this.config = config;
|
this.config = config;
|
||||||
this.fs = getFileSystem(hoodieTable, config);
|
|
||||||
this.hoodieTable = hoodieTable;
|
this.hoodieTable = hoodieTable;
|
||||||
this.originalSchema = new Schema.Parser().parse(config.getSchema());
|
this.fs = getFileSystem();
|
||||||
this.writerSchema = createHoodieWriteSchema(originalSchema);
|
|
||||||
this.timer = new HoodieTimer().startTimer();
|
|
||||||
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
|
|
||||||
!hoodieTable.getIndex().isImplicitWithStorage(),
|
|
||||||
config.getWriteStatusFailureFraction());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static FileSystem getFileSystem(HoodieTable hoodieTable, HoodieWriteConfig config) {
|
protected abstract FileSystem getFileSystem();
|
||||||
return new HoodieWrapperFileSystem(hoodieTable.getMetaClient().getFs(), config.isConsistencyCheckEnabled()
|
|
||||||
? new FailSafeConsistencyGuard(hoodieTable.getMetaClient().getFs(),
|
|
||||||
config.getMaxConsistencyChecks(), config.getInitialConsistencyCheckIntervalMs(),
|
|
||||||
config.getMaxConsistencyCheckIntervalMs()) : new NoOpConsistencyGuard());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate a write token based on the currently running spark task and its place in the spark dag.
|
|
||||||
*/
|
|
||||||
private static String makeSparkWriteToken() {
|
|
||||||
return FSUtils.makeWriteToken(TaskContext.getPartitionId(), TaskContext.get().stageId(),
|
|
||||||
TaskContext.get().taskAttemptId());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Schema createHoodieWriteSchema(Schema originalSchema) {
|
|
||||||
return HoodieAvroUtils.addMetadataFields(originalSchema);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Path makeNewPath(String partitionPath) {
|
|
||||||
Path path = FSUtils.getPartitionPath(config.getBasePath(), partitionPath);
|
|
||||||
try {
|
|
||||||
fs.mkdirs(path); // create a new partition as needed.
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException("Failed to make dir " + path, e);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Path(path.toString(), FSUtils.makeDataFileName(commitTime, writeToken, fileId));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an empty marker file corresponding to storage writer path
|
|
||||||
* @param partitionPath Partition path
|
|
||||||
*/
|
|
||||||
protected void createMarkerFile(String partitionPath) {
|
|
||||||
Path markerPath = makeNewMarkerPath(partitionPath);
|
|
||||||
try {
|
|
||||||
logger.info("Creating Marker Path=" + markerPath);
|
|
||||||
fs.create(markerPath, false).close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieException("Failed to create marker file " + markerPath, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* THe marker path will be <base-path>/.hoodie/.temp/<instant_ts>/2019/04/25/filename
|
|
||||||
* @param partitionPath
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private Path makeNewMarkerPath(String partitionPath) {
|
|
||||||
Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(commitTime));
|
|
||||||
Path path = FSUtils.getPartitionPath(markerRootPath, partitionPath);
|
|
||||||
try {
|
|
||||||
fs.mkdirs(path); // create a new partition as needed.
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new HoodieIOException("Failed to make dir " + path, e);
|
|
||||||
}
|
|
||||||
return new Path(path.toString(), FSUtils.makeMarkerFile(commitTime, writeToken, fileId));
|
|
||||||
}
|
|
||||||
|
|
||||||
public Schema getWriterSchema() {
|
|
||||||
return writerSchema;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines whether we can accept the incoming records, into the current file, depending on
|
|
||||||
* <p>
|
|
||||||
* - Whether it belongs to the same partitionPath as existing records - Whether the current file
|
|
||||||
* written bytes lt max file size
|
|
||||||
*/
|
|
||||||
public boolean canWrite(HoodieRecord record) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Perform the actual writing of the given record into the backing file.
|
|
||||||
*/
|
|
||||||
public void write(HoodieRecord record, Optional<IndexedRecord> insertValue) {
|
|
||||||
// NO_OP
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Perform the actual writing of the given record into the backing file.
|
|
||||||
*/
|
|
||||||
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord, Optional<Exception> exception) {
|
|
||||||
Optional recordMetadata = record.getData().getMetadata();
|
|
||||||
if (exception.isPresent() && exception.get() instanceof Throwable) {
|
|
||||||
// Not throwing exception from here, since we don't want to fail the entire job for a single record
|
|
||||||
writeStatus.markFailure(record, exception.get(), recordMetadata);
|
|
||||||
logger.error("Error writing record " + record, exception.get());
|
|
||||||
} else {
|
|
||||||
write(record, avroRecord);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields
|
|
||||||
* @param record
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
protected GenericRecord rewriteRecord(GenericRecord record) {
|
|
||||||
return HoodieAvroUtils.rewriteRecord(record, writerSchema);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Closes this handle.
 *
 * @return the {@link WriteStatus} produced by the implementation (presumably the final status of
 *         the write; confirm against the concrete handles)
 */
public abstract WriteStatus close();
|
|
||||||
|
|
||||||
/**
 * Returns the write status tracked by this handle.
 *
 * @return the {@link WriteStatus} exposed by the implementation
 */
public abstract WriteStatus getWriteStatus();
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,158 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
|
import com.uber.hoodie.common.BloomFilter;
|
||||||
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.model.HoodieTableType;
|
||||||
|
import com.uber.hoodie.common.util.HoodieTimer;
|
||||||
|
import com.uber.hoodie.common.util.ParquetUtils;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.exception.HoodieIndexException;
|
||||||
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a bunch of keys and returns ones that are present in the file group.
|
||||||
|
*/
|
||||||
|
public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends HoodieReadHandle<T> {
|
||||||
|
|
||||||
|
private static Logger logger = LogManager.getLogger(HoodieKeyLookupHandle.class);
|
||||||
|
|
||||||
|
private final HoodieTableType tableType;
|
||||||
|
|
||||||
|
private final BloomFilter bloomFilter;
|
||||||
|
|
||||||
|
private final List<String> candidateRecordKeys;
|
||||||
|
|
||||||
|
private long totalKeysChecked;
|
||||||
|
|
||||||
|
public HoodieKeyLookupHandle(HoodieWriteConfig config, HoodieTable<T> hoodieTable,
|
||||||
|
Pair<String, String> partitionPathFilePair) {
|
||||||
|
super(config, null, hoodieTable, partitionPathFilePair);
|
||||||
|
this.tableType = hoodieTable.getMetaClient().getTableType();
|
||||||
|
this.candidateRecordKeys = new ArrayList<>();
|
||||||
|
this.totalKeysChecked = 0;
|
||||||
|
HoodieTimer timer = new HoodieTimer().startTimer();
|
||||||
|
this.bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(hoodieTable.getHadoopConf(),
|
||||||
|
new Path(getLatestDataFile().getPath()));
|
||||||
|
logger.info(String.format("Read bloom filter from %s in %d ms", partitionPathFilePair, timer.endTimer()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a list of row keys and one file, return only row keys existing in that file.
|
||||||
|
*/
|
||||||
|
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
|
||||||
|
List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
|
||||||
|
List<String> foundRecordKeys = new ArrayList<>();
|
||||||
|
try {
|
||||||
|
// Load all rowKeys from the file, to double-confirm
|
||||||
|
if (!candidateRecordKeys.isEmpty()) {
|
||||||
|
HoodieTimer timer = new HoodieTimer().startTimer();
|
||||||
|
Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
|
||||||
|
new HashSet<>(candidateRecordKeys));
|
||||||
|
foundRecordKeys.addAll(fileRowKeys);
|
||||||
|
logger.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
|
||||||
|
timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
|
||||||
|
if (logger.isDebugEnabled()) {
|
||||||
|
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new HoodieIndexException("Error checking candidate keys against file.", e);
|
||||||
|
}
|
||||||
|
return foundRecordKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds the key for look up.
|
||||||
|
*/
|
||||||
|
public void addKey(String recordKey) {
|
||||||
|
// check record key against bloom filter of current file & add to possible keys if needed
|
||||||
|
if (bloomFilter.mightContain(recordKey)) {
|
||||||
|
if (logger.isDebugEnabled()) {
|
||||||
|
logger.debug("Record key " + recordKey + " matches bloom filter in " + partitionPathFilePair);
|
||||||
|
}
|
||||||
|
candidateRecordKeys.add(recordKey);
|
||||||
|
}
|
||||||
|
totalKeysChecked++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Of all the keys, that were added, return a list of keys that were actually found in the file group.
|
||||||
|
*/
|
||||||
|
public KeyLookupResult getLookupResult() {
|
||||||
|
if (logger.isDebugEnabled()) {
|
||||||
|
logger.debug("#The candidate row keys for " + partitionPathFilePair + " => " + candidateRecordKeys);
|
||||||
|
}
|
||||||
|
|
||||||
|
HoodieDataFile dataFile = getLatestDataFile();
|
||||||
|
List<String> matchingKeys = checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys,
|
||||||
|
new Path(dataFile.getPath()));
|
||||||
|
logger.info(String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)",
|
||||||
|
totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(),
|
||||||
|
matchingKeys.size()));
|
||||||
|
return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(),
|
||||||
|
dataFile.getCommitTime(), matchingKeys);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encapsulates the result from a key lookup
|
||||||
|
*/
|
||||||
|
public static class KeyLookupResult {
|
||||||
|
|
||||||
|
private final String fileId;
|
||||||
|
private final String baseInstantTime;
|
||||||
|
private final List<String> matchingRecordKeys;
|
||||||
|
private final String partitionPath;
|
||||||
|
|
||||||
|
public KeyLookupResult(String fileId, String partitionPath, String baseInstantTime,
|
||||||
|
List<String> matchingRecordKeys) {
|
||||||
|
this.fileId = fileId;
|
||||||
|
this.partitionPath = partitionPath;
|
||||||
|
this.baseInstantTime = baseInstantTime;
|
||||||
|
this.matchingRecordKeys = matchingRecordKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFileId() {
|
||||||
|
return fileId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBaseInstantTime() {
|
||||||
|
return baseInstantTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPartitionPath() {
|
||||||
|
return partitionPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getMatchingRecordKeys() {
|
||||||
|
return matchingRecordKeys;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -28,6 +28,7 @@ import com.uber.hoodie.common.model.HoodieWriteStat;
|
|||||||
import com.uber.hoodie.common.model.HoodieWriteStat.RuntimeStats;
|
import com.uber.hoodie.common.model.HoodieWriteStat.RuntimeStats;
|
||||||
import com.uber.hoodie.common.util.DefaultSizeEstimator;
|
import com.uber.hoodie.common.util.DefaultSizeEstimator;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
import com.uber.hoodie.common.util.HoodieRecordSizeEstimator;
|
import com.uber.hoodie.common.util.HoodieRecordSizeEstimator;
|
||||||
import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
|
import com.uber.hoodie.common.util.collection.ExternalSpillableMap;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
@@ -42,6 +43,7 @@ import java.util.Iterator;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -50,7 +52,7 @@ import org.apache.log4j.Logger;
|
|||||||
import org.apache.spark.TaskContext;
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
@SuppressWarnings("Duplicates")
|
@SuppressWarnings("Duplicates")
|
||||||
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHandle<T> {
|
public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWriteHandle<T> {
|
||||||
|
|
||||||
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
private static Logger logger = LogManager.getLogger(HoodieMergeHandle.class);
|
||||||
|
|
||||||
@@ -85,6 +87,64 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
.getPartitionPath(), dataFileToBeMerged);
|
.getPartitionPath(), dataFileToBeMerged);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Schema createHoodieWriteSchema(Schema originalSchema) {
|
||||||
|
return HoodieAvroUtils.addMetadataFields(originalSchema);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Path makeNewPath(String partitionPath) {
|
||||||
|
Path path = FSUtils.getPartitionPath(config.getBasePath(), partitionPath);
|
||||||
|
try {
|
||||||
|
fs.mkdirs(path); // create a new partition as needed.
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException("Failed to make dir " + path, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, writeToken, fileId));
|
||||||
|
}
|
||||||
|
|
||||||
|
public Schema getWriterSchema() {
|
||||||
|
return writerSchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether we can accept the incoming records, into the current file, depending on
|
||||||
|
* <p>
|
||||||
|
* - Whether it belongs to the same partitionPath as existing records - Whether the current file written bytes lt max
|
||||||
|
* file size
|
||||||
|
*/
|
||||||
|
public boolean canWrite(HoodieRecord record) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform the actual writing of the given record into the backing file.
|
||||||
|
*/
|
||||||
|
public void write(HoodieRecord record, Optional<IndexedRecord> insertValue) {
|
||||||
|
// NO_OP
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform the actual writing of the given record into the backing file.
|
||||||
|
*/
|
||||||
|
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord, Optional<Exception> exception) {
|
||||||
|
Optional recordMetadata = record.getData().getMetadata();
|
||||||
|
if (exception.isPresent() && exception.get() instanceof Throwable) {
|
||||||
|
// Not throwing exception from here, since we don't want to fail the entire job for a single record
|
||||||
|
writeStatus.markFailure(record, exception.get(), recordMetadata);
|
||||||
|
logger.error("Error writing record " + record, exception.get());
|
||||||
|
} else {
|
||||||
|
write(record, avroRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields
|
||||||
|
*/
|
||||||
|
protected GenericRecord rewriteRecord(GenericRecord record) {
|
||||||
|
return HoodieAvroUtils.rewriteRecord(record, writerSchema);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract old file path, initialize StorageWriter and WriteStatus
|
* Extract old file path, initialize StorageWriter and WriteStatus
|
||||||
*/
|
*/
|
||||||
@@ -95,14 +155,14 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
String latestValidFilePath = dataFileToBeMerged.getFileName();
|
String latestValidFilePath = dataFileToBeMerged.getFileName();
|
||||||
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
|
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
|
||||||
|
|
||||||
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, commitTime,
|
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
|
||||||
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
|
||||||
partitionMetadata.trySave(TaskContext.getPartitionId());
|
partitionMetadata.trySave(TaskContext.getPartitionId());
|
||||||
|
|
||||||
oldFilePath = new Path(
|
oldFilePath = new Path(
|
||||||
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
|
config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
|
||||||
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/") + FSUtils
|
String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
|
||||||
.makeDataFileName(commitTime, writeToken, fileId)).toString();
|
+ FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
|
||||||
newFilePath = new Path(config.getBasePath(), relativePath);
|
newFilePath = new Path(config.getBasePath(), relativePath);
|
||||||
|
|
||||||
logger.info(String
|
logger.info(String
|
||||||
@@ -120,13 +180,13 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
|
|
||||||
// Create the writer for writing the new version file
|
// Create the writer for writing the new version file
|
||||||
storageWriter = HoodieStorageWriterFactory
|
storageWriter = HoodieStorageWriterFactory
|
||||||
.getStorageWriter(commitTime, newFilePath, hoodieTable, config, writerSchema);
|
.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema);
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
logger.error("Error in update task at commit " + commitTime, io);
|
logger.error("Error in update task at commit " + instantTime, io);
|
||||||
writeStatus.setGlobalError(io);
|
writeStatus.setGlobalError(io);
|
||||||
throw new HoodieUpsertException(
|
throw new HoodieUpsertException(
|
||||||
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
|
"Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
|
||||||
+ commitTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
|
+ instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,7 +208,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieIOHa
|
|||||||
partitionPath = record.getPartitionPath();
|
partitionPath = record.getPartitionPath();
|
||||||
keyToNewRecords.put(record.getRecordKey(), record);
|
keyToNewRecords.put(record.getRecordKey(), record);
|
||||||
// update the new location of the record, so we know where to find it next
|
// update the new location of the record, so we know where to find it next
|
||||||
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
|
record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
|
||||||
}
|
}
|
||||||
logger.info("Number of entries in MemoryBasedMap => "
|
logger.info("Number of entries in MemoryBasedMap => "
|
||||||
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
+ ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.util.ParquetUtils;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract range information for a given file slice
|
||||||
|
*/
|
||||||
|
public class HoodieRangeInfoHandle<T extends HoodieRecordPayload> extends HoodieReadHandle<T> {
|
||||||
|
|
||||||
|
public HoodieRangeInfoHandle(HoodieWriteConfig config, HoodieTable<T> hoodieTable,
|
||||||
|
Pair<String, String> partitionPathFilePair) {
|
||||||
|
super(config, null, hoodieTable, partitionPathFilePair);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[] getMinMaxKeys() {
|
||||||
|
HoodieDataFile dataFile = getLatestDataFile();
|
||||||
|
return ParquetUtils.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), new Path(dataFile.getPath()));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
|
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for read operations done logically on the file group.
|
||||||
|
*/
|
||||||
|
public abstract class HoodieReadHandle<T extends HoodieRecordPayload> extends HoodieIOHandle {
|
||||||
|
|
||||||
|
protected final Pair<String, String> partitionPathFilePair;
|
||||||
|
|
||||||
|
public HoodieReadHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T> hoodieTable,
|
||||||
|
Pair<String, String> partitionPathFilePair) {
|
||||||
|
super(config, instantTime, hoodieTable);
|
||||||
|
this.partitionPathFilePair = partitionPathFilePair;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected FileSystem getFileSystem() {
|
||||||
|
return hoodieTable.getMetaClient().getFs();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Pair<String, String> getPartitionPathFilePair() {
|
||||||
|
return partitionPathFilePair;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFileId() {
|
||||||
|
return partitionPathFilePair.getRight();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected HoodieDataFile getLatestDataFile() {
|
||||||
|
return hoodieTable.getROFileSystemView()
|
||||||
|
.getLatestDataFile(partitionPathFilePair.getLeft(), partitionPathFilePair.getRight()).get();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.io;
|
||||||
|
|
||||||
|
import com.uber.hoodie.WriteStatus;
|
||||||
|
import com.uber.hoodie.common.io.storage.HoodieWrapperFileSystem;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecord;
|
||||||
|
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||||
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
|
import com.uber.hoodie.common.util.FailSafeConsistencyGuard;
|
||||||
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
|
import com.uber.hoodie.common.util.HoodieTimer;
|
||||||
|
import com.uber.hoodie.common.util.NoOpConsistencyGuard;
|
||||||
|
import com.uber.hoodie.common.util.ReflectionUtils;
|
||||||
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.exception.HoodieException;
|
||||||
|
import com.uber.hoodie.exception.HoodieIOException;
|
||||||
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Optional;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.TaskContext;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for all write operations logically performed at the file group level.
|
||||||
|
*/
|
||||||
|
public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends HoodieIOHandle {
|
||||||
|
|
||||||
|
private static Logger logger = LogManager.getLogger(HoodieWriteHandle.class);
|
||||||
|
protected final Schema originalSchema;
|
||||||
|
protected final Schema writerSchema;
|
||||||
|
protected HoodieTimer timer;
|
||||||
|
protected final WriteStatus writeStatus;
|
||||||
|
protected final String fileId;
|
||||||
|
protected final String writeToken;
|
||||||
|
|
||||||
|
public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String fileId, HoodieTable<T> hoodieTable) {
|
||||||
|
super(config, instantTime, hoodieTable);
|
||||||
|
this.fileId = fileId;
|
||||||
|
this.writeToken = makeSparkWriteToken();
|
||||||
|
this.originalSchema = new Schema.Parser().parse(config.getSchema());
|
||||||
|
this.writerSchema = createHoodieWriteSchema(originalSchema);
|
||||||
|
this.timer = new HoodieTimer().startTimer();
|
||||||
|
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
|
||||||
|
!hoodieTable.getIndex().isImplicitWithStorage(),
|
||||||
|
config.getWriteStatusFailureFraction());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FileSystem getFileSystem(HoodieTable hoodieTable, HoodieWriteConfig config) {
|
||||||
|
return new HoodieWrapperFileSystem(hoodieTable.getMetaClient().getFs(), config.isConsistencyCheckEnabled()
|
||||||
|
? new FailSafeConsistencyGuard(hoodieTable.getMetaClient().getFs(),
|
||||||
|
config.getMaxConsistencyChecks(), config.getInitialConsistencyCheckIntervalMs(),
|
||||||
|
config.getMaxConsistencyCheckIntervalMs()) : new NoOpConsistencyGuard());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a write token based on the currently running spark task and its place in the spark dag.
|
||||||
|
*/
|
||||||
|
private static String makeSparkWriteToken() {
|
||||||
|
return FSUtils.makeWriteToken(TaskContext.getPartitionId(), TaskContext.get().stageId(),
|
||||||
|
TaskContext.get().taskAttemptId());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Schema createHoodieWriteSchema(Schema originalSchema) {
|
||||||
|
return HoodieAvroUtils.addMetadataFields(originalSchema);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Path makeNewPath(String partitionPath) {
|
||||||
|
Path path = FSUtils.getPartitionPath(config.getBasePath(), partitionPath);
|
||||||
|
try {
|
||||||
|
fs.mkdirs(path); // create a new partition as needed.
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException("Failed to make dir " + path, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, writeToken, fileId));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an empty marker file corresponding to storage writer path
|
||||||
|
*
|
||||||
|
* @param partitionPath Partition path
|
||||||
|
*/
|
||||||
|
protected void createMarkerFile(String partitionPath) {
|
||||||
|
Path markerPath = makeNewMarkerPath(partitionPath);
|
||||||
|
try {
|
||||||
|
logger.info("Creating Marker Path=" + markerPath);
|
||||||
|
fs.create(markerPath, false).close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieException("Failed to create marker file " + markerPath, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* THe marker path will be <base-path>/.hoodie/.temp/<instant_ts>/2019/04/25/filename
|
||||||
|
*/
|
||||||
|
private Path makeNewMarkerPath(String partitionPath) {
|
||||||
|
Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(instantTime));
|
||||||
|
Path path = FSUtils.getPartitionPath(markerRootPath, partitionPath);
|
||||||
|
try {
|
||||||
|
fs.mkdirs(path); // create a new partition as needed.
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieIOException("Failed to make dir " + path, e);
|
||||||
|
}
|
||||||
|
return new Path(path.toString(), FSUtils.makeMarkerFile(instantTime, writeToken, fileId));
|
||||||
|
}
|
||||||
|
|
||||||
|
public Schema getWriterSchema() {
|
||||||
|
return writerSchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether we can accept the incoming records, into the current file, depending on
|
||||||
|
* <p>
|
||||||
|
* - Whether it belongs to the same partitionPath as existing records - Whether the current file written bytes lt max
|
||||||
|
* file size
|
||||||
|
*/
|
||||||
|
public boolean canWrite(HoodieRecord record) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform the actual writing of the given record into the backing file.
|
||||||
|
*/
|
||||||
|
public void write(HoodieRecord record, Optional<IndexedRecord> insertValue) {
|
||||||
|
// NO_OP
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform the actual writing of the given record into the backing file.
|
||||||
|
*/
|
||||||
|
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord, Optional<Exception> exception) {
|
||||||
|
Optional recordMetadata = record.getData().getMetadata();
|
||||||
|
if (exception.isPresent() && exception.get() instanceof Throwable) {
|
||||||
|
// Not throwing exception from here, since we don't want to fail the entire job for a single record
|
||||||
|
writeStatus.markFailure(record, exception.get(), recordMetadata);
|
||||||
|
logger.error("Error writing record " + record, exception.get());
|
||||||
|
} else {
|
||||||
|
write(record, avroRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields
|
||||||
|
*/
|
||||||
|
protected GenericRecord rewriteRecord(GenericRecord record) {
|
||||||
|
return HoodieAvroUtils.rewriteRecord(record, writerSchema);
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract WriteStatus close();
|
||||||
|
|
||||||
|
public abstract WriteStatus getWriteStatus();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected FileSystem getFileSystem() {
|
||||||
|
return new HoodieWrapperFileSystem(hoodieTable.getMetaClient().getFs(), config.isConsistencyCheckEnabled()
|
||||||
|
? new FailSafeConsistencyGuard(hoodieTable.getMetaClient().getFs(),
|
||||||
|
config.getMaxConsistencyChecks(), config.getInitialConsistencyCheckIntervalMs(),
|
||||||
|
config.getMaxConsistencyCheckIntervalMs()) : new NoOpConsistencyGuard());
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -500,7 +500,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
|||||||
/**
|
/**
|
||||||
* Helper class for a small file's location and its actual size on disk
|
* Helper class for a small file's location and its actual size on disk
|
||||||
*/
|
*/
|
||||||
class SmallFile implements Serializable {
|
static class SmallFile implements Serializable {
|
||||||
|
|
||||||
HoodieRecordLocation location;
|
HoodieRecordLocation location;
|
||||||
long sizeBytes;
|
long sizeBytes;
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ public class WorkloadStat implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
long addUpdates(HoodieRecordLocation location, long numUpdates) {
|
long addUpdates(HoodieRecordLocation location, long numUpdates) {
|
||||||
updateLocationToCount.put(location.getFileId(), Pair.of(location.getCommitTime(), numUpdates));
|
updateLocationToCount.put(location.getFileId(), Pair.of(location.getInstantTime(), numUpdates));
|
||||||
return this.numUpdates += numUpdates;
|
return this.numUpdates += numUpdates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -217,7 +217,7 @@ public class TestHoodieClientBase implements Serializable {
|
|||||||
for (HoodieRecord rec : taggedRecords) {
|
for (HoodieRecord rec : taggedRecords) {
|
||||||
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
|
assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
|
||||||
assertEquals("All records should have commit time " + commitTime + ", since updates were made",
|
assertEquals("All records should have commit time " + commitTime + ", since updates were made",
|
||||||
rec.getCurrentLocation().getCommitTime(), commitTime);
|
rec.getCurrentLocation().getInstantTime(), commitTime);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -166,7 +166,7 @@ public class TestHbaseIndex {
|
|||||||
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
|
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
|
||||||
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
|
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
|
||||||
assertTrue(javaRDD.filter(
|
assertTrue(javaRDD.filter(
|
||||||
record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime()
|
record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime()
|
||||||
.equals(newCommitTime))).distinct().count() == 200);
|
.equals(newCommitTime))).distinct().count() == 200);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,8 +36,10 @@ import com.uber.hoodie.common.model.HoodieTestUtils;
|
|||||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||||
import com.uber.hoodie.common.util.FSUtils;
|
import com.uber.hoodie.common.util.FSUtils;
|
||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
|
import com.uber.hoodie.common.util.collection.Pair;
|
||||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.io.HoodieKeyLookupHandle;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -200,10 +202,10 @@ public class TestHoodieBloomIndex {
|
|||||||
// no longer sorted, but should have same files.
|
// no longer sorted, but should have same files.
|
||||||
|
|
||||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2")),
|
||||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1")),
|
||||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3", "000", "000")),
|
||||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")));
|
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4", "001", "003")));
|
||||||
assertEquals(expected, filesList);
|
assertEquals(expected, filesList);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -279,7 +281,7 @@ public class TestHoodieBloomIndex {
|
|||||||
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
|
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
|
||||||
record4.getRecordKey());
|
record4.getRecordKey());
|
||||||
|
|
||||||
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
|
List<String> results = HoodieKeyLookupHandle.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
|
||||||
new Path(basePath + "/2016/01/31/" + filename));
|
new Path(basePath + "/2016/01/31/" + filename));
|
||||||
assertEquals(results.size(), 2);
|
assertEquals(results.size(), 2);
|
||||||
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
|
||||||
@@ -417,10 +419,11 @@ public class TestHoodieBloomIndex {
|
|||||||
|
|
||||||
// Let's tag
|
// Let's tag
|
||||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
|
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
|
||||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);
|
JavaPairRDD<HoodieKey, Optional<Pair<String, String>>> taggedRecordRDD = bloomIndex
|
||||||
|
.fetchRecordLocation(keysRDD, jsc, table);
|
||||||
|
|
||||||
// Should not find any files
|
// Should not find any files
|
||||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
for (Tuple2<HoodieKey, Optional<Pair<String, String>>> record : taggedRecordRDD.collect()) {
|
||||||
assertTrue(!record._2.isPresent());
|
assertTrue(!record._2.isPresent());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -438,18 +441,16 @@ public class TestHoodieBloomIndex {
|
|||||||
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);
|
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);
|
||||||
|
|
||||||
// Check results
|
// Check results
|
||||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
for (Tuple2<HoodieKey, Optional<Pair<String, String>>> record : taggedRecordRDD.collect()) {
|
||||||
if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||||
assertTrue(record._2.isPresent());
|
assertTrue(record._2.isPresent());
|
||||||
Path path1 = new Path(record._2.get());
|
assertEquals(FSUtils.getFileId(filename1), record._2.get().getRight());
|
||||||
assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName()));
|
|
||||||
} else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
} else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||||
assertTrue(record._2.isPresent());
|
assertTrue(record._2.isPresent());
|
||||||
Path path2 = new Path(record._2.get());
|
|
||||||
if (record._1.getPartitionPath().equals("2015/01/31")) {
|
if (record._1.getPartitionPath().equals("2015/01/31")) {
|
||||||
assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path2.getName()));
|
assertEquals(FSUtils.getFileId(filename3), record._2.get().getRight());
|
||||||
} else {
|
} else {
|
||||||
assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName()));
|
assertEquals(FSUtils.getFileId(filename2), record._2.get().getRight());
|
||||||
}
|
}
|
||||||
} else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
} else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||||
assertTrue(!record._2.isPresent());
|
assertTrue(!record._2.isPresent());
|
||||||
|
|||||||
@@ -18,7 +18,11 @@
|
|||||||
|
|
||||||
package com.uber.hoodie.index.bloom;
|
package com.uber.hoodie.index.bloom;
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||||
@@ -32,16 +36,17 @@ import com.uber.hoodie.common.util.FSUtils;
|
|||||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
import com.uber.hoodie.table.HoodieTable;
|
import com.uber.hoodie.table.HoodieTable;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
@@ -154,20 +159,20 @@ public class TestHoodieGlobalBloomIndex {
|
|||||||
|
|
||||||
Map<String, BloomIndexFileInfo> filesMap = toFileMap(filesList);
|
Map<String, BloomIndexFileInfo> filesMap = toFileMap(filesList);
|
||||||
// key ranges checks
|
// key ranges checks
|
||||||
assertNull(filesMap.get("2016/04/01/2_0_20160401010101.parquet").getMaxRecordKey());
|
assertNull(filesMap.get("2016/04/01/2").getMaxRecordKey());
|
||||||
assertNull(filesMap.get("2016/04/01/2_0_20160401010101.parquet").getMinRecordKey());
|
assertNull(filesMap.get("2016/04/01/2").getMinRecordKey());
|
||||||
assertFalse(filesMap.get("2015/03/12/1_0_20150312101010.parquet").hasKeyRanges());
|
assertFalse(filesMap.get("2015/03/12/1").hasKeyRanges());
|
||||||
assertNotNull(filesMap.get("2015/03/12/3_0_20150312101010.parquet").getMaxRecordKey());
|
assertNotNull(filesMap.get("2015/03/12/3").getMaxRecordKey());
|
||||||
assertNotNull(filesMap.get("2015/03/12/3_0_20150312101010.parquet").getMinRecordKey());
|
assertNotNull(filesMap.get("2015/03/12/3").getMinRecordKey());
|
||||||
assertTrue(filesMap.get("2015/03/12/3_0_20150312101010.parquet").hasKeyRanges());
|
assertTrue(filesMap.get("2015/03/12/3").hasKeyRanges());
|
||||||
|
|
||||||
Map<String, BloomIndexFileInfo> expected = new HashMap<>();
|
Map<String, BloomIndexFileInfo> expected = new HashMap<>();
|
||||||
expected.put("2016/04/01/2_0_20160401010101.parquet", new BloomIndexFileInfo("2_0_20160401010101.parquet"));
|
expected.put("2016/04/01/2", new BloomIndexFileInfo("2"));
|
||||||
expected.put("2015/03/12/1_0_20150312101010.parquet", new BloomIndexFileInfo("1_0_20150312101010.parquet"));
|
expected.put("2015/03/12/1", new BloomIndexFileInfo("1"));
|
||||||
expected.put("2015/03/12/3_0_20150312101010.parquet",
|
expected.put("2015/03/12/3",
|
||||||
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000"));
|
new BloomIndexFileInfo("3", "000", "000"));
|
||||||
expected.put("2015/03/12/4_0_20150312101010.parquet",
|
expected.put("2015/03/12/4",
|
||||||
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"));
|
new BloomIndexFileInfo("4", "001", "003"));
|
||||||
|
|
||||||
assertEquals(expected, filesMap);
|
assertEquals(expected, filesMap);
|
||||||
}
|
}
|
||||||
@@ -300,7 +305,7 @@ public class TestHoodieGlobalBloomIndex {
|
|||||||
private Map<String, BloomIndexFileInfo> toFileMap(List<Tuple2<String, BloomIndexFileInfo>> filesList) {
|
private Map<String, BloomIndexFileInfo> toFileMap(List<Tuple2<String, BloomIndexFileInfo>> filesList) {
|
||||||
Map<String, BloomIndexFileInfo> filesMap = new HashMap<>();
|
Map<String, BloomIndexFileInfo> filesMap = new HashMap<>();
|
||||||
for (Tuple2<String, BloomIndexFileInfo> t : filesList) {
|
for (Tuple2<String, BloomIndexFileInfo> t : filesList) {
|
||||||
filesMap.put(t._1() + "/" + t._2().getFileName(), t._2());
|
filesMap.put(t._1() + "/" + t._2().getFileId(), t._2());
|
||||||
}
|
}
|
||||||
return filesMap;
|
return filesMap;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,11 +27,11 @@ import java.io.Serializable;
|
|||||||
*/
|
*/
|
||||||
public class HoodieRecordLocation implements Serializable {
|
public class HoodieRecordLocation implements Serializable {
|
||||||
|
|
||||||
private final String commitTime;
|
private final String instantTime;
|
||||||
private final String fileId;
|
private final String fileId;
|
||||||
|
|
||||||
public HoodieRecordLocation(String commitTime, String fileId) {
|
public HoodieRecordLocation(String instantTime, String fileId) {
|
||||||
this.commitTime = commitTime;
|
this.instantTime = instantTime;
|
||||||
this.fileId = fileId;
|
this.fileId = fileId;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -44,26 +44,26 @@ public class HoodieRecordLocation implements Serializable {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
HoodieRecordLocation otherLoc = (HoodieRecordLocation) o;
|
HoodieRecordLocation otherLoc = (HoodieRecordLocation) o;
|
||||||
return Objects.equal(commitTime, otherLoc.commitTime)
|
return Objects.equal(instantTime, otherLoc.instantTime)
|
||||||
&& Objects.equal(fileId, otherLoc.fileId);
|
&& Objects.equal(fileId, otherLoc.fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hashCode(commitTime, fileId);
|
return Objects.hashCode(instantTime, fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
final StringBuilder sb = new StringBuilder("HoodieRecordLocation {");
|
final StringBuilder sb = new StringBuilder("HoodieRecordLocation {");
|
||||||
sb.append("commitTime=").append(commitTime).append(", ");
|
sb.append("instantTime=").append(instantTime).append(", ");
|
||||||
sb.append("fileId=").append(fileId);
|
sb.append("fileId=").append(fileId);
|
||||||
sb.append('}');
|
sb.append('}');
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getCommitTime() {
|
public String getInstantTime() {
|
||||||
return commitTime;
|
return instantTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getFileId() {
|
public String getFileId() {
|
||||||
|
|||||||
@@ -346,10 +346,10 @@ public class HoodieTestUtils {
|
|||||||
try {
|
try {
|
||||||
logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
|
logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
|
||||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
|
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
|
||||||
.overBaseCommit(location.getCommitTime()).withFs(fs).build();
|
.overBaseCommit(location.getInstantTime()).withFs(fs).build();
|
||||||
|
|
||||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
|
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
|
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
|
||||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||||
logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
|
logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ public class TestExternalSpillableMap {
|
|||||||
assert onDiskHoodieRecord.getKey().equals(records.get(dkey).getKey());
|
assert onDiskHoodieRecord.getKey().equals(records.get(dkey).getKey());
|
||||||
// compare the member variables of HoodieRecord not set by the constructor
|
// compare the member variables of HoodieRecord not set by the constructor
|
||||||
assert records.get(ikey).getCurrentLocation().getFileId().equals(SpillableMapTestUtils.DUMMY_FILE_ID);
|
assert records.get(ikey).getCurrentLocation().getFileId().equals(SpillableMapTestUtils.DUMMY_FILE_ID);
|
||||||
assert records.get(ikey).getCurrentLocation().getCommitTime().equals(SpillableMapTestUtils.DUMMY_COMMIT_TIME);
|
assert records.get(ikey).getCurrentLocation().getInstantTime().equals(SpillableMapTestUtils.DUMMY_COMMIT_TIME);
|
||||||
|
|
||||||
// test contains
|
// test contains
|
||||||
assertTrue(records.containsKey(ikey));
|
assertTrue(records.containsKey(ikey));
|
||||||
|
|||||||
Reference in New Issue
Block a user