[HUDI-2164] Let users build a clustering plan and execute it at once using HoodieClusteringJob for async clustering (#3259)
* add --mode schedule/execute/scheduleAndExecute
* fix checkstyle
* add UT testHoodieAsyncClusteringJobWithScheduleAndExecute
* change logging
* try to make UT succeed
* try to fix UT
* modify UT
* address review comments

Co-authored-by: yuezhang <yuezhang@freewheel.tv>
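In short: HoodieClusteringJob can now schedule a clustering plan and execute it in a single run. A minimal sketch of the one-shot mode, assuming a live JavaSparkContext jsc, an existing Hudi table under basePath, and a clustering properties file like the one the tests below point at; the config fields, constants, and cluster(retry) call are the ones this diff exercises:

    import org.apache.hudi.utilities.HoodieClusteringJob;

    // One-shot async clustering: schedule a plan and execute it at once.
    // Assumptions: jsc and basePath already exist in the caller's context.
    HoodieClusteringJob.Config config = new HoodieClusteringJob.Config();
    config.basePath = basePath;
    config.propsFilePath = basePath + "/clusteringjob.properties";
    config.runningMode = HoodieClusteringJob.SCHEDULE_AND_EXECUTE; // i.e. --mode scheduleAndExecute
    HoodieClusteringJob job = new HoodieClusteringJob(jsc, config);
    int exitCode = job.cluster(0); // 0 on success, as the new test below asserts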
@@ -18,6 +18,10 @@
 package org.apache.hudi.utilities.functional;
 
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.util.ConcurrentModificationException;
+import java.util.concurrent.ExecutorService;
 import org.apache.hudi.common.config.DFSPropertiesConfiguration;
 import org.apache.hudi.common.config.HoodieConfig;
 import org.apache.hudi.common.config.TypedProperties;
@@ -77,22 +81,22 @@ import org.apache.spark.sql.api.java.UDF4;
 import org.apache.spark.sql.functions;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
 
 import java.io.IOException;
-import java.sql.Connection;
-import java.sql.DriverManager;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.ConcurrentModificationException;
 import java.util.List;
 import java.util.Properties;
-import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
@@ -103,6 +107,7 @@ import java.util.stream.Stream;
 import static org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.CHECKPOINT_KEY;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
@@ -142,6 +147,38 @@ public class TestHoodieDeltaStreamer extends TestHoodieDeltaStreamerBase {
     return props;
   }
 
+  protected HoodieDeltaStreamer initialHoodieDeltaStreamer(String tableBasePath, int totalRecords, String asyncCluster) throws IOException {
+    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
+    cfg.continuousMode = true;
+    cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
+    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", asyncCluster, ""));
+    return new HoodieDeltaStreamer(cfg, jsc);
+  }
+
+  protected HoodieClusteringJob initialHoodieClusteringJob(String tableBasePath, String clusteringInstantTime, boolean runSchedule, String scheduleAndExecute) {
+    HoodieClusteringJob.Config scheduleClusteringConfig = buildHoodieClusteringUtilConfig(tableBasePath,
+        clusteringInstantTime, runSchedule, scheduleAndExecute);
+    return new HoodieClusteringJob(jsc, scheduleClusteringConfig);
+  }
+
+  @AfterAll
+  public static void cleanupClass() {
+    UtilitiesTestBase.cleanupClass();
+    if (testUtils != null) {
+      testUtils.teardown();
+    }
+  }
+
+  @BeforeEach
+  public void setup() throws Exception {
+    super.setup();
+  }
+
+  @AfterEach
+  public void teardown() throws Exception {
+    super.teardown();
+  }
+
   static class TestHelpers {
 
     static HoodieDeltaStreamer.Config makeDropAllConfig(String basePath, WriteOperationType op) {
@@ -317,6 +354,22 @@
       int numDeltaCommits = (int) timeline.getInstants().count();
       assertTrue(minExpected <= numDeltaCommits, "Got=" + numDeltaCommits + ", exp >=" + minExpected);
     }
+
+    static void assertNoReplaceCommits(int expected, String tablePath, FileSystem fs) {
+      HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build();
+      HoodieTimeline timeline = meta.getActiveTimeline().getCompletedReplaceTimeline();
+      LOG.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList()));
+      int numReplaceCommits = (int) timeline.getInstants().count();
+      assertEquals(expected, numReplaceCommits, "Got=" + numReplaceCommits + ", exp =" + expected);
+    }
+
+    static void assertAtLeastNReplaceRequests(int minExpected, String tablePath, FileSystem fs) {
+      HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).setLoadActiveTimelineOnLoad(true).build();
+      HoodieTimeline timeline = meta.getActiveTimeline().filterPendingReplaceTimeline();
+      LOG.info("Timeline Instants=" + meta.getActiveTimeline().getInstants().collect(Collectors.toList()));
+      int numReplaceRequests = (int) timeline.getInstants().count();
+      assertTrue(minExpected <= numReplaceRequests, "Got=" + numReplaceRequests + ", exp >=" + minExpected);
+    }
   }
 
   @Test
@@ -794,6 +847,10 @@
     dsFuture.get();
   }
 
+  private void deltaStreamerTestRunner(HoodieDeltaStreamer ds, Function<Boolean, Boolean> condition) throws Exception {
+    deltaStreamerTestRunner(ds, null, condition);
+  }
+
   @Test
   public void testInlineClustering() throws Exception {
     String tableBasePath = dfsBasePath + "/inlineClustering";
@@ -836,32 +893,34 @@
   }
 
   private HoodieClusteringJob.Config buildHoodieClusteringUtilConfig(String basePath,
-      String clusteringInstantTime, boolean runSchedule) {
+                                                                     String clusteringInstantTime,
+                                                                     boolean runSchedule) {
+    return buildHoodieClusteringUtilConfig(basePath, clusteringInstantTime, runSchedule, null);
+  }
+
+  private HoodieClusteringJob.Config buildHoodieClusteringUtilConfig(String basePath,
+                                                                     String clusteringInstantTime,
+                                                                     boolean runSchedule,
+                                                                     String runningMode) {
     HoodieClusteringJob.Config config = new HoodieClusteringJob.Config();
     config.basePath = basePath;
     config.clusteringInstantTime = clusteringInstantTime;
     config.runSchedule = runSchedule;
     config.propsFilePath = dfsBasePath + "/clusteringjob.properties";
+    config.runningMode = runningMode;
     return config;
   }
 
   @Test
   public void testHoodieAsyncClusteringJob() throws Exception {
     String tableBasePath = dfsBasePath + "/asyncClustering";
     // Keep it higher than batch-size to test continuous mode
     int totalRecords = 3000;
-
-    // Initial bulk insert
-    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
-    cfg.continuousMode = true;
-    cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
-    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", ""));
-    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
-    deltaStreamerTestRunner(ds, cfg, (r) -> {
+    HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "true");
+    HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, null);
+
+    deltaStreamerTestRunner(ds, (r) -> {
       TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
-      HoodieClusteringJob.Config scheduleClusteringConfig = buildHoodieClusteringUtilConfig(tableBasePath,
-          null, true);
-      HoodieClusteringJob scheduleClusteringJob = new HoodieClusteringJob(jsc, scheduleClusteringConfig);
 
       Option<String> scheduleClusteringInstantTime = Option.empty();
       try {
         scheduleClusteringInstantTime = scheduleClusteringJob.doSchedule();
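The refactored testHoodieAsyncClusteringJob above keeps the original schedule-only flow: the clustering job is built once outside the runner lambda, which then only asks it for a new plan. A hedged sketch of that flow, under the same jsc/basePath assumptions as the earlier sketch; doSchedule() and Hudi's Option type are the ones this diff uses:

    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.utilities.HoodieClusteringJob;

    // Schedule-only: create a clustering plan and return its instant time,
    // without executing it. Mirrors the doSchedule() call in the test above.
    HoodieClusteringJob.Config scheduleConfig = new HoodieClusteringJob.Config();
    scheduleConfig.basePath = basePath;
    scheduleConfig.propsFilePath = basePath + "/clusteringjob.properties";
    scheduleConfig.runningMode = HoodieClusteringJob.SCHEDULE; // i.e. --mode schedule
    Option<String> instantTime = new HoodieClusteringJob(jsc, scheduleConfig).doSchedule();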
@@ -923,6 +982,52 @@
     });
   }
 
+  @ParameterizedTest
+  @ValueSource(strings = {"schedule", "execute", "scheduleAndExecute"})
+  public void testHoodieAsyncClusteringJobWithScheduleAndExecute(String runningMode) throws Exception {
+    String tableBasePath = dfsBasePath + "/asyncClustering2";
+    HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "false");
+    HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, runningMode);
+
+    deltaStreamerTestRunner(ds, (r) -> {
+      Exception exception = null;
+      TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
+      try {
+        int result = scheduleClusteringJob.cluster(0);
+        if (result == 0) {
+          LOG.info("Cluster success");
+        } else {
+          LOG.warn("Cluster failed");
+          return false;
+        }
+      } catch (Exception e) {
+        LOG.warn("ScheduleAndExecute clustering failed", e);
+        exception = e;
+        if (!runningMode.equalsIgnoreCase(HoodieClusteringJob.EXECUTE)) {
+          return false;
+        }
+      }
+      switch (runningMode.toLowerCase()) {
+        case HoodieClusteringJob.SCHEDULE_AND_EXECUTE: {
+          TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
+          return true;
+        }
+        case HoodieClusteringJob.SCHEDULE: {
+          TestHelpers.assertAtLeastNReplaceRequests(2, tableBasePath, dfs);
+          TestHelpers.assertNoReplaceCommits(0, tableBasePath, dfs);
+          return true;
+        }
+        case HoodieClusteringJob.EXECUTE: {
+          assertNotNull(exception);
+          assertEquals("--instant-time couldn't be null when executing clustering plan.", exception.getMessage());
+          return true;
+        }
+        default:
+          throw new IllegalStateException("Unexpected value: " + runningMode);
+      }
+    });
+  }
+
   /**
    * Test Bulk Insert and upserts with hive syncing. Tests Hudi incremental processing using a 2 step pipeline The first
    * step involves using a SQL template to transform a source TEST-DATA-SOURCE ============================> HUDI TABLE
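The parameterized test above pins down the contract of the three running modes: schedule leaves at least one pending replace request and no completed replace commits, scheduleAndExecute produces completed replace commits, and execute fails fast when no --instant-time is supplied. A sketch of the execute half of the split flow that contract implies, continuing from the schedule-only sketch earlier and under the same assumptions:

    // Execute a previously scheduled plan. Per the test above, execute mode
    // requires an instant time; without one the job fails with
    // "--instant-time couldn't be null when executing clustering plan."
    HoodieClusteringJob.Config executeConfig = new HoodieClusteringJob.Config();
    executeConfig.basePath = basePath;
    executeConfig.propsFilePath = basePath + "/clusteringjob.properties";
    executeConfig.runningMode = HoodieClusteringJob.EXECUTE; // i.e. --mode execute
    executeConfig.clusteringInstantTime = instantTime.get(); // from the schedule-only sketch
    int exitCode = new HoodieClusteringJob(jsc, executeConfig).cluster(0);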