1
0

Rename IO Handles & introduce stub for BucketedIndex

- UpdateHandle -> MergeHandle, InsertHandle -> CreateHandle
 - Also a bunch of code cleanup in different places
This commit is contained in:
Vinoth Chandar
2017-05-14 09:44:44 -07:00
committed by prazanna
parent b0a2a23372
commit 23e7badd8a
15 changed files with 182 additions and 54 deletions

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.index;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
/**
 * A stateless index implementation that uses a deterministic mapping function to
 * determine the fileID for a given record.
 *
 * Pros:
 * - Fast
 *
 * Cons:
 * - Need to tune the number of buckets per partition path manually (FIXME: Need to autotune this)
 * - Could increase write amplification on copy-on-write storage since inserts always rewrite files
 * - Not global.
 *
 */
public class BucketedIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

    private static Logger logger = LogManager.getLogger(BucketedIndex.class);

    public BucketedIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
        super(config, jsc);
    }

    /**
     * Deterministically maps a record key to a bucket id in
     * [0, numBucketsPerPartition).
     *
     * Uses {@link Math#floorMod(int, int)} rather than the {@code %} operator:
     * {@code hashCode()} may be negative, and {@code %} would then produce a
     * negative (invalid) bucket id.
     *
     * @param recordKey the record key to map
     * @return the bucket id, as a string, for use as a fileID
     */
    private String getBucket(String recordKey) {
        return String.valueOf(Math.floorMod(recordKey.hashCode(), config.getNumBucketsPerPartition()));
    }

    /**
     * Location lookup is purely computational here: each key's bucket is derived
     * from its record key, so no files are read. A location is always "present".
     */
    @Override
    public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
        return hoodieKeys.mapToPair(hk -> new Tuple2<>(hk, Optional.of(getBucket(hk.getRecordKey()))));
    }

    /**
     * Tags every incoming record with its deterministic bucket.
     */
    @Override
    public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
        return recordRDD.map(record -> {
            String bucket = getBucket(record.getRecordKey());
            //HACK(vc) a non-existent commit is provided here.
            record.setCurrentLocation(new HoodieRecordLocation("000", bucket));
            return record;
        });
    }

    /**
     * No-op: the mapping is stateless, so there is nothing to persist after a write.
     */
    @Override
    public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) throws HoodieIndexException {
        return writeStatusRDD;
    }

    @Override
    public boolean rollbackCommit(String commitTime) {
        // nothing to rollback in the index.
        return true;
    }
}

View File

@@ -67,7 +67,7 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
@Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> hoodieTable) {
JavaRDD<HoodieKey> hoodieKeys, HoodieTable<T> table) {
throw new UnsupportedOperationException("HBase index does not implement check exist yet");
}
@@ -234,7 +234,8 @@ public class HBaseIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
/**
 * No-op rollback that always reports success: per the inline note, HBase only
 * supports recordKey -> fileID lookups (no reverse index), so entries written
 * for {@code commitTime} cannot be located and undone here.
 *
 * @param commitTime the commit being rolled back (unused)
 * @return always {@code true}
 */
@Override
public boolean rollbackCommit(String commitTime) {
// TODO (weiy)
// Can't really rollback here. HBase only can let you go from recordKey to fileID,
// not the other way around
return true;
}
}

View File

@@ -85,13 +85,13 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
}
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> hoodieTable) {
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) {
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD =
lookupIndex(partitionRecordKeyPairRDD, hoodieTable);
lookupIndex(partitionRecordKeyPairRDD, table);
JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));
@@ -103,7 +103,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
String fileName = keyPathTuple._2._2.get();
String partitionPath = keyPathTuple._2._1.getPartitionPath();
recordLocationPath = Optional.of(new Path(
new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath),
new Path(table.getMetaClient().getBasePath(), partitionPath),
fileName).toUri().getPath());
} else {
recordLocationPath = Optional.absent();

View File

@@ -36,7 +36,6 @@ import java.io.Serializable;
/**
* Base class for different types of indexes to determine the mapping from uuid
*
* TODO(vc): need methods for recovery and rollback
*/
public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Serializable {
protected transient JavaSparkContext jsc = null;
@@ -44,7 +43,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
public enum IndexType {
HBASE,
INMEMORY,
BLOOM
BLOOM,
BUCKETED
}
protected final HoodieWriteConfig config;
@@ -60,11 +60,11 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
* value is present, it is the path component (without scheme) of the URI underlying file
*
* @param hoodieKeys
* @param metaClient
* @param table
* @return
*/
public abstract JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> metaClient);
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table);
/**
* Looks up the index and tags each incoming record with a location of a file that contains the
@@ -95,6 +95,8 @@ public abstract class HoodieIndex<T extends HoodieRecordPayload> implements Seri
return new InMemoryHashIndex<>(config, jsc);
case BLOOM:
return new HoodieBloomIndex<>(config, jsc);
case BUCKETED:
return new BucketedIndex<>(config, jsc);
}
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
}

View File

@@ -55,7 +55,7 @@ public class InMemoryHashIndex<T extends HoodieRecordPayload> extends HoodieInde
@Override
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> hoodieTable) {
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) {
throw new UnsupportedOperationException("InMemory index does not implement check exist yet");
}