1
0

[HUDI-2502] Refactor index in hudi-client module (#3778)

- Refactor Index to reduce lines of code and enable reuse across engines.
This commit is contained in:
Y Ethan Guo
2021-10-28 01:16:00 -07:00
committed by GitHub
parent e5b6b8602c
commit 0223c442ec
70 changed files with 2196 additions and 1567 deletions

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.client;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
@@ -34,7 +35,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.JavaHoodieIndex;
import org.apache.hudi.index.JavaHoodieIndexFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieJavaTable;
import org.apache.hudi.table.HoodieTable;
@@ -67,14 +68,15 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
// Create a Hoodie table which encapsulated the commits and files visible
HoodieJavaTable<T> table = HoodieJavaTable.create(config, (HoodieJavaEngineContext) context);
Timer.Context indexTimer = metrics.getIndexCtx();
List<HoodieRecord<T>> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table);
List<HoodieRecord<T>> recordsWithLocation = HoodieList.getList(
getIndex().tagLocation(HoodieList.of(hoodieRecords), context, table));
metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList());
}
@Override
protected HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> createIndex(HoodieWriteConfig writeConfig) {
return JavaHoodieIndex.createIndex(config);
protected HoodieIndex createIndex(HoodieWriteConfig writeConfig) {
return JavaHoodieIndexFactory.createIndex(config);
}
@Override

View File

@@ -78,7 +78,7 @@ public class HoodieJavaEngineContext extends HoodieEngineContext {
}
@Override
public <T> HoodieData<T> parallelize(List<T> data) {
public <T> HoodieData<T> parallelize(List<T> data, int parallelism) {
return HoodieList.of(data);
}

View File

@@ -1,32 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.bloom.HoodieBaseBloomIndex;
/**
 * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata.
 * <p>
 * This subclass of {@link HoodieBaseBloomIndex} declares no members besides its constructor, which only
 * forwards the write config to the parent class.
 */
public class JavaHoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieBaseBloomIndex<T> {
/**
 * Creates the index.
 *
 * @param config write config, passed unchanged to the {@link HoodieBaseBloomIndex} constructor
 */
public JavaHoodieBloomIndex(HoodieWriteConfig config) {
super(config);
}
}

View File

@@ -7,13 +7,14 @@
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.index;
@@ -21,12 +22,12 @@ package org.apache.hudi.index;
import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.PublicAPIMethod;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.table.HoodieTable;
@@ -38,36 +39,33 @@ public abstract class JavaHoodieIndex<T extends HoodieRecordPayload> extends Hoo
super(config);
}
/**
 * Creates the {@link HoodieIndex} described by the given write config.
 *
 * <p>A non-empty index class name in the config takes precedence and is instantiated through
 * {@code ReflectionUtils.loadClass}; otherwise the index is selected by the configured index type.
 *
 * @param config write config supplying the index class name and the index type
 * @return the index instance
 * @throws HoodieIndexException if the configured class is not a {@link HoodieIndex}, or if the
 *                              configured index type is not handled here
 */
public static HoodieIndex createIndex(HoodieWriteConfig config) {
  // first use index class config to create index.
  if (!StringUtils.isNullOrEmpty(config.getIndexClass())) {
    Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config);
    if (!(instance instanceof HoodieIndex)) {
      throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex");
    }
    // Cast to the type that was actually checked above (and that this method returns). Casting to
    // JavaHoodieIndex would fail with a ClassCastException for a valid HoodieIndex that is not a
    // JavaHoodieIndex, bypassing the descriptive error above.
    return (HoodieIndex) instance;
  }

  // TODO more indexes to be added
  switch (config.getIndexType()) {
    case INMEMORY:
      return new JavaInMemoryHashIndex(config);
    case BLOOM:
      return new JavaHoodieBloomIndex(config);
    default:
      throw new HoodieIndexException("Unsupported index type " + config.getIndexType());
  }
}
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
@Deprecated
@PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED)
public abstract List<WriteStatus> updateLocation(List<WriteStatus> writeStatuses,
HoodieEngineContext context,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) throws HoodieIndexException;
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
@Deprecated
@PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED)
public abstract List<HoodieRecord<T>> tagLocation(List<HoodieRecord<T>> records,
HoodieEngineContext context,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) throws HoodieIndexException;
/**
 * {@link HoodieData}-based entry point: unwraps the records into a {@link List}, delegates to the
 * list-based {@code tagLocation}, and wraps the result back into a {@link HoodieList}.
 */
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public HoodieData<HoodieRecord<T>> tagLocation(
    HoodieData<HoodieRecord<T>> records, HoodieEngineContext context,
    HoodieTable hoodieTable) throws HoodieIndexException {
  List<HoodieRecord<T>> inputRecords = HoodieList.getList(records);
  List<HoodieRecord<T>> taggedRecords = tagLocation(inputRecords, context, hoodieTable);
  return HoodieList.of(taggedRecords);
}
/**
 * {@link HoodieData}-based entry point: unwraps the write statuses into a {@link List}, delegates to
 * the list-based {@code updateLocation}, and wraps the result back into a {@link HoodieList}.
 */
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public HoodieData<WriteStatus> updateLocation(
    HoodieData<WriteStatus> writeStatuses, HoodieEngineContext context,
    HoodieTable hoodieTable) throws HoodieIndexException {
  List<WriteStatus> inputStatuses = HoodieList.getList(writeStatuses);
  List<WriteStatus> updatedStatuses = updateLocation(inputStatuses, context, hoodieTable);
  return HoodieList.of(updatedStatuses);
}
}

View File

@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.bloom.HoodieBloomIndex;
import org.apache.hudi.index.bloom.ListBasedHoodieBloomIndexHelper;
import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex;
/**
 * Factory producing the {@link HoodieIndex} described by a {@link HoodieWriteConfig}.
 */
public final class JavaHoodieIndexFactory {

  /**
   * Builds an index from the write config.
   *
   * <p>A non-empty index class name in the config wins and is instantiated through
   * {@code ReflectionUtils.loadClass}; otherwise the configured index type selects the implementation.
   *
   * @param config write config supplying the index class name and the index type
   * @return the index instance
   * @throws HoodieIndexException if the configured class is not a {@link HoodieIndex}, or if the
   *                              configured index type is not handled here
   */
  public static HoodieIndex createIndex(HoodieWriteConfig config) {
    // first use index class config to create index.
    boolean hasCustomIndexClass = !StringUtils.isNullOrEmpty(config.getIndexClass());
    return hasCustomIndexClass ? createFromIndexClass(config) : createFromIndexType(config);
  }

  /** Instantiates the index class named in the config and verifies it is a {@link HoodieIndex}. */
  private static HoodieIndex createFromIndexClass(HoodieWriteConfig config) {
    Object candidate = ReflectionUtils.loadClass(config.getIndexClass(), config);
    if (candidate instanceof HoodieIndex) {
      return (HoodieIndex) candidate;
    }
    throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex");
  }

  /** Maps the configured index type to one of the index implementations handled by this factory. */
  private static HoodieIndex createFromIndexType(HoodieWriteConfig config) {
    // TODO more indexes to be added
    switch (config.getIndexType()) {
      case INMEMORY: {
        return new HoodieInMemoryHashIndex<>(config);
      }
      case BLOOM: {
        return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance());
      }
      default: {
        throw new HoodieIndexException("Unsupported index type " + config.getIndexType());
      }
    }
  }
}

View File

@@ -1,120 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;
/**
 * Hoodie Index implementation backed by an in-memory Hash map.
 * <p>
 * ONLY USE FOR LOCAL TESTING
 * <p>
 * The key-to-location map is a static field, so every instance of this class reads and writes the same map.
 */
@SuppressWarnings("checkstyle:LineLength")
public class JavaInMemoryHashIndex<T extends HoodieRecordPayload> extends JavaHoodieIndex<T> {

  // Shared by all instances; created by whichever constructor call runs first.
  private static ConcurrentMap<HoodieKey, HoodieRecordLocation> recordLocationMap;

  /**
   * Creates the index and, if no instance has done so yet, the shared location map.
   *
   * @param config write config, passed to the parent constructor
   */
  public JavaInMemoryHashIndex(HoodieWriteConfig config) {
    super(config);
    synchronized (JavaInMemoryHashIndex.class) {
      if (recordLocationMap == null) {
        recordLocationMap = new ConcurrentHashMap<>();
      }
    }
  }

  /**
   * Sets the current location on every record whose key is present in the map.
   *
   * @param records     records to look up
   * @param context     engine context (not used by this implementation)
   * @param hoodieTable table (not used by this implementation)
   * @return a new list containing all input records, in input order
   */
  @Override
  public List<HoodieRecord<T>> tagLocation(List<HoodieRecord<T>> records, HoodieEngineContext context,
                                           HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) {
    List<HoodieRecord<T>> taggedRecords = new ArrayList<>(records.size());
    for (HoodieRecord<T> record : records) {
      // Single lookup: a containsKey() followed by get() can observe a concurrent remove() in between
      // and would then set a null location on the record.
      HoodieRecordLocation knownLocation = recordLocationMap.get(record.getKey());
      if (knownLocation != null) {
        record.unseal();
        record.setCurrentLocation(knownLocation);
        record.seal();
      }
      taggedRecords.add(record);
    }
    return taggedRecords;
  }

  /**
   * Applies the written records of each write status to the map: a record with a new location is
   * inserted or overwritten, a record without one is removed. Records marked as errored in their
   * write status are ignored.
   *
   * @param writeStatusList write statuses to apply
   * @param context         engine context (not used by this implementation)
   * @param hoodieTable     table (not used by this implementation)
   * @return a new list holding the same write status objects, in input order
   */
  @Override
  public List<WriteStatus> updateLocation(List<WriteStatus> writeStatusList,
                                          HoodieEngineContext context,
                                          HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) {
    return writeStatusList.stream().map(writeStatus -> {
      for (HoodieRecord record : writeStatus.getWrittenRecords()) {
        HoodieKey key = record.getKey();
        if (writeStatus.isErrored(key)) {
          continue;
        }
        Option<HoodieRecordLocation> newLocation = record.getNewLocation();
        if (newLocation.isPresent()) {
          recordLocationMap.put(key, newLocation.get());
        } else {
          // Delete existing index for a deleted record
          recordLocationMap.remove(key);
        }
      }
      return writeStatus;
    }).collect(Collectors.toList());
  }

  /**
   * Does nothing to the map and always reports success.
   */
  @Override
  public boolean rollbackCommit(String instantTime) {
    return true;
  }

  /**
   * Only looks up by recordKey.
   */
  @Override
  public boolean isGlobal() {
    return true;
  }

  /**
   * Always reports that log files can be indexed. Locations are served from the in-memory map that
   * {@code updateLocation} maintains.
   */
  @Override
  public boolean canIndexLogFiles() {
    return true;
  }

  /**
   * Index needs to be explicitly updated after storage write.
   */
  @Override
  public boolean isImplicitWithStorage() {
    return false;
  }
}

View File

@@ -30,7 +30,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.JavaHoodieIndex;
import org.apache.hudi.index.JavaHoodieIndexFactory;
import java.util.List;
@@ -62,7 +62,7 @@ public abstract class HoodieJavaTable<T extends HoodieRecordPayload>
}
@Override
protected HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> getIndex(HoodieWriteConfig config, HoodieEngineContext context) {
return JavaHoodieIndex.createIndex(config);
protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context) {
return JavaHoodieIndexFactory.createIndex(config);
}
}

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
@@ -128,7 +129,8 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
protected void updateIndex(List<WriteStatus> writeStatuses, HoodieWriteMetadata<List<WriteStatus>> result) {
Instant indexStartTime = Instant.now();
// Update the index back
List<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
List<WriteStatus> statuses = HoodieList.getList(
table.getIndex().updateLocation(HoodieList.of(writeStatuses), context, table));
result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
result.setWriteStatuses(statuses);
}
@@ -329,7 +331,8 @@ public abstract class BaseJavaCommitActionExecutor<T extends HoodieRecordPayload
public void updateIndexAndCommitIfNeeded(List<WriteStatus> writeStatuses, HoodieWriteMetadata result) {
Instant indexStartTime = Instant.now();
// Update the index back
List<WriteStatus> statuses = table.getIndex().updateLocation(writeStatuses, context, table);
List<WriteStatus> statuses = HoodieList.getList(
table.getIndex().updateLocation(HoodieList.of(writeStatuses), context, table));
result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
result.setWriteStatuses(statuses);
result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(result));

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieKey;
@@ -97,8 +98,8 @@ public class JavaDeleteHelper<R> extends
dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
Instant beginTag = Instant.now();
// perform index look up to get existing location of records
List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords =
table.getIndex().tagLocation(dedupedRecords, context, table);
List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(
table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
// filter out non existent keys/records

View File

@@ -19,11 +19,14 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieList;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.HoodieTable;
import java.util.List;
import java.util.Map;
@@ -45,9 +48,14 @@ public class JavaWriteHelper<T extends HoodieRecordPayload,R> extends AbstractWr
}
@Override
public List<HoodieRecord<T>> deduplicateRecords(List<HoodieRecord<T>> records,
HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> index,
int parallelism) {
protected List<HoodieRecord<T>> tag(List<HoodieRecord<T>> dedupedRecords, HoodieEngineContext context, HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table) {
return HoodieList.getList(
table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
}
@Override
public List<HoodieRecord<T>> deduplicateRecords(
List<HoodieRecord<T>> records, HoodieIndex<T, ?, ?, ?> index, int parallelism) {
boolean isIndexingGlobal = index.isGlobal();
Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
HoodieKey hoodieKey = record.getKey();