[HUDI-2488][HUDI-3175] Implement async metadata indexing (#4693)
- Add a new action called INDEX, whose state transition is described in the RFC.
- Changes in timeline to support the new action.
- Add an index planner in ScheduleIndexActionExecutor.
- Add index plan executor in RunIndexActionExecutor.
- Add 3 APIs in HoodieTableMetadataWriter:
  a) scheduleIndex: will generate an index plan based on latest completed instant, initialize file groups and add a requested INDEX instant,
  b) index: executes the index plan and also takes care of writes that happened after indexing was requested,
  c) dropIndex: will drop index by removing the given metadata partition.
- Add 2 new table configs to serve as the source of truth for inflight and completed indexes.
- Support upgrade/downgrade taking care of the newly added configs.
- Add tool to trigger indexing in HoodieIndexer.
- Handle corner cases related to partial failures.
- Abort gracefully after deleting partition and instant.
- Handle other actions in timeline to consider before catching up
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieIndexCommitMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieIndexPartitionInfo;
|
||||
import org.apache.hudi.client.HoodieReadClient;
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.metadata.MetadataPartitionType;
|
||||
import org.apache.hudi.testutils.providers.SparkProvider;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestHoodieIndexer extends HoodieCommonTestHarness implements SparkProvider {
|
||||
|
||||
private static transient SparkSession spark;
|
||||
private static transient SQLContext sqlContext;
|
||||
private static transient JavaSparkContext jsc;
|
||||
private static transient HoodieSparkEngineContext context;
|
||||
|
||||
@BeforeEach
|
||||
public void init() throws IOException {
|
||||
boolean initialized = spark != null;
|
||||
if (!initialized) {
|
||||
SparkConf sparkConf = conf();
|
||||
SparkRDDWriteClient.registerClasses(sparkConf);
|
||||
HoodieReadClient.addHoodieSupport(sparkConf);
|
||||
spark = SparkSession.builder().config(sparkConf).getOrCreate();
|
||||
sqlContext = spark.sqlContext();
|
||||
jsc = new JavaSparkContext(spark.sparkContext());
|
||||
context = new HoodieSparkEngineContext(jsc);
|
||||
}
|
||||
initPath();
|
||||
metaClient = HoodieTestUtils.init(basePath, getTableType());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetRequestedPartitionTypes() {
|
||||
HoodieIndexer.Config config = new HoodieIndexer.Config();
|
||||
config.basePath = basePath;
|
||||
config.tableName = "indexer_test";
|
||||
config.indexTypes = "FILES,BLOOM_FILTERS,COLUMN_STATS";
|
||||
HoodieIndexer indexer = new HoodieIndexer(jsc, config);
|
||||
List<MetadataPartitionType> partitionTypes = indexer.getRequestedPartitionTypes(config.indexTypes);
|
||||
assertFalse(partitionTypes.contains(MetadataPartitionType.FILES));
|
||||
assertTrue(partitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS));
|
||||
assertTrue(partitionTypes.contains(MetadataPartitionType.COLUMN_STATS));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsIndexBuiltForAllRequestedTypes() {
|
||||
HoodieIndexer.Config config = new HoodieIndexer.Config();
|
||||
config.basePath = basePath;
|
||||
config.tableName = "indexer_test";
|
||||
config.indexTypes = "BLOOM_FILTERS,COLUMN_STATS";
|
||||
HoodieIndexer indexer = new HoodieIndexer(jsc, config);
|
||||
HoodieIndexCommitMetadata commitMetadata = HoodieIndexCommitMetadata.newBuilder()
|
||||
.setIndexPartitionInfos(Arrays.asList(new HoodieIndexPartitionInfo(
|
||||
1,
|
||||
MetadataPartitionType.COLUMN_STATS.getPartitionPath(),
|
||||
"0000")))
|
||||
.build();
|
||||
assertFalse(indexer.isIndexBuiltForAllRequestedTypes(commitMetadata.getIndexPartitionInfos()));
|
||||
|
||||
config.indexTypes = "COLUMN_STATS";
|
||||
indexer = new HoodieIndexer(jsc, config);
|
||||
assertTrue(indexer.isIndexBuiltForAllRequestedTypes(commitMetadata.getIndexPartitionInfos()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieEngineContext context() {
|
||||
return context;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SparkSession spark() {
|
||||
return spark;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SQLContext sqlContext() {
|
||||
return sqlContext;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaSparkContext jsc() {
|
||||
return jsc;
|
||||
}
|
||||
}
|
||||
@@ -129,6 +129,7 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase {
|
||||
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/uber_config.properties", dfs, dfsBasePath + "/config/uber_config.properties");
|
||||
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/short_trip_uber_config.properties", dfs, dfsBasePath + "/config/short_trip_uber_config.properties");
|
||||
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/clusteringjob.properties", dfs, dfsBasePath + "/clusteringjob.properties");
|
||||
UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/indexer.properties", dfs, dfsBasePath + "/indexer.properties");
|
||||
|
||||
writeCommonPropsToFile(dfs, dfsBasePath);
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ import org.apache.hudi.hive.HoodieHiveClient;
|
||||
import org.apache.hudi.keygen.SimpleKeyGenerator;
|
||||
import org.apache.hudi.utilities.DummySchemaProvider;
|
||||
import org.apache.hudi.utilities.HoodieClusteringJob;
|
||||
import org.apache.hudi.utilities.HoodieIndexer;
|
||||
import org.apache.hudi.utilities.deltastreamer.DeltaSync;
|
||||
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
|
||||
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
||||
@@ -129,6 +130,9 @@ import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.apache.hudi.utilities.UtilHelpers.EXECUTE;
|
||||
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE;
|
||||
import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE;
|
||||
import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY;
|
||||
import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
|
||||
import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME;
|
||||
@@ -397,6 +401,22 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
||||
assertTrue(minExpected <= numDeltaCommits, "Got=" + numDeltaCommits + ", exp >=" + minExpected);
|
||||
}
|
||||
|
||||
static void assertPendingIndexCommit(String tablePath, FileSystem fs) {
|
||||
HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build();
|
||||
HoodieTimeline timeline = meta.getActiveTimeline().getAllCommitsTimeline().filterPendingIndexTimeline();
|
||||
LOG.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList()));
|
||||
int numIndexCommits = (int) timeline.getInstants().count();
|
||||
assertEquals(1, numIndexCommits, "Got=" + numIndexCommits + ", exp=1");
|
||||
}
|
||||
|
||||
static void assertCompletedIndexCommit(String tablePath, FileSystem fs) {
|
||||
HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build();
|
||||
HoodieTimeline timeline = meta.getActiveTimeline().getAllCommitsTimeline().filterCompletedIndexTimeline();
|
||||
LOG.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList()));
|
||||
int numIndexCommits = (int) timeline.getInstants().count();
|
||||
assertEquals(1, numIndexCommits, "Got=" + numIndexCommits + ", exp=1");
|
||||
}
|
||||
|
||||
static void assertNoReplaceCommits(String tablePath, FileSystem fs) {
|
||||
HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build();
|
||||
HoodieTimeline timeline = meta.getActiveTimeline().getCompletedReplaceTimeline();
|
||||
@@ -961,6 +981,53 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
||||
return config;
|
||||
}
|
||||
|
||||
private HoodieIndexer.Config buildIndexerConfig(String basePath,
|
||||
String tableName,
|
||||
String indexInstantTime,
|
||||
String runningMode,
|
||||
String indexTypes) {
|
||||
HoodieIndexer.Config config = new HoodieIndexer.Config();
|
||||
config.basePath = basePath;
|
||||
config.tableName = tableName;
|
||||
config.indexInstantTime = indexInstantTime;
|
||||
config.propsFilePath = dfsBasePath + "/indexer.properties";
|
||||
config.runningMode = runningMode;
|
||||
config.indexTypes = indexTypes;
|
||||
return config;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHoodieIndexer() throws Exception {
|
||||
String tableBasePath = dfsBasePath + "/asyncindexer";
|
||||
HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 1000, "false");
|
||||
|
||||
deltaStreamerTestRunner(ds, (r) -> {
|
||||
TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
|
||||
|
||||
Option<String> scheduleIndexInstantTime = Option.empty();
|
||||
try {
|
||||
HoodieIndexer scheduleIndexingJob = new HoodieIndexer(jsc,
|
||||
buildIndexerConfig(tableBasePath, ds.getConfig().targetTableName, null, SCHEDULE, "COLUMN_STATS"));
|
||||
scheduleIndexInstantTime = scheduleIndexingJob.doSchedule();
|
||||
} catch (Exception e) {
|
||||
LOG.info("Schedule indexing failed", e);
|
||||
return false;
|
||||
}
|
||||
if (scheduleIndexInstantTime.isPresent()) {
|
||||
TestHelpers.assertPendingIndexCommit(tableBasePath, dfs);
|
||||
LOG.info("Schedule indexing success, now build index with instant time " + scheduleIndexInstantTime.get());
|
||||
HoodieIndexer runIndexingJob = new HoodieIndexer(jsc,
|
||||
buildIndexerConfig(tableBasePath, ds.getConfig().targetTableName, scheduleIndexInstantTime.get(), EXECUTE, "COLUMN_STATS"));
|
||||
runIndexingJob.start(0);
|
||||
LOG.info("Metadata indexing success");
|
||||
TestHelpers.assertCompletedIndexCommit(tableBasePath, dfs);
|
||||
} else {
|
||||
LOG.warn("Metadata indexing failed");
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
@Disabled("HUDI-3710 to fix the ConcurrentModificationException")
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
@@ -1131,28 +1198,28 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
||||
LOG.info("Cluster success");
|
||||
} else {
|
||||
LOG.warn("Import failed");
|
||||
if (!runningMode.toLowerCase().equals(HoodieClusteringJob.EXECUTE)) {
|
||||
if (!runningMode.toLowerCase().equals(EXECUTE)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.warn("ScheduleAndExecute clustering failed", e);
|
||||
exception = e;
|
||||
if (!runningMode.equalsIgnoreCase(HoodieClusteringJob.EXECUTE)) {
|
||||
if (!runningMode.equalsIgnoreCase(EXECUTE)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
switch (runningMode.toLowerCase()) {
|
||||
case HoodieClusteringJob.SCHEDULE_AND_EXECUTE: {
|
||||
case SCHEDULE_AND_EXECUTE: {
|
||||
TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
|
||||
return true;
|
||||
}
|
||||
case HoodieClusteringJob.SCHEDULE: {
|
||||
case SCHEDULE: {
|
||||
TestHelpers.assertAtLeastNReplaceRequests(2, tableBasePath, dfs);
|
||||
TestHelpers.assertNoReplaceCommits(tableBasePath, dfs);
|
||||
return true;
|
||||
}
|
||||
case HoodieClusteringJob.EXECUTE: {
|
||||
case EXECUTE: {
|
||||
TestHelpers.assertNoReplaceCommits(tableBasePath, dfs);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
hoodie.metadata.enable=true
|
||||
hoodie.metadata.index.async=true
|
||||
hoodie.metadata.index.column.stats.enable=true
|
||||
hoodie.metadata.index.check.timeout.seconds=60
|
||||
hoodie.write.concurrency.mode=optimistic_concurrency_control
|
||||
hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider
|
||||
Reference in New Issue
Block a user