feat(executor): 完成文件检索扫描功能
使用flink分布式扫描日志文件和数据文件,检索关键词
This commit is contained in:
@@ -1,36 +1,14 @@
|
||||
package com.lanyuanxiaoyao.service.executor.manager;
|
||||
|
||||
import cn.hutool.core.util.IdUtil;
|
||||
import com.eshore.odcp.hudi.connector.utils.executor.Runner;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.lanyuanxiaoyao.service.executor.core.TaskConstants;
|
||||
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
|
||||
import com.lanyuanxiaoyao.service.executor.manager.configuration.ExecutorConfiguration;
|
||||
import com.lanyuanxiaoyao.service.executor.manager.configuration.HadoopConfiguration;
|
||||
import com.ulisesbocchio.jasyptspringboot.annotation.EnableEncryptableProperties;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import org.apache.flink.client.cli.ClientOptions;
|
||||
import org.apache.flink.configuration.*;
|
||||
import org.apache.flink.yarn.configuration.YarnConfigOptions;
|
||||
import org.apache.flink.yarn.configuration.YarnDeploymentTarget;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.eclipse.collections.api.factory.Maps;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.gson.GsonAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.cloud.client.discovery.EnableDiscoveryClient;
|
||||
import org.springframework.http.converter.json.Jackson2ObjectMapperBuilder;
|
||||
import org.springframework.retry.annotation.EnableRetry;
|
||||
|
||||
import static com.eshore.odcp.hudi.connector.Constants.HALF_HOUR;
|
||||
import static com.eshore.odcp.hudi.connector.Constants.MINUTE;
|
||||
|
||||
/**
|
||||
* @author lanyuanxiaoyao
|
||||
* @date 2023-12-04
|
||||
@@ -46,77 +24,8 @@ import static com.eshore.odcp.hudi.connector.Constants.MINUTE;
|
||||
@EnableConfigurationProperties
|
||||
@EnableEncryptableProperties
|
||||
@EnableRetry
|
||||
public class ExecutorManagerApplication implements ApplicationRunner {
|
||||
private final HadoopConfiguration hadoopConfiguration;
|
||||
private final ExecutorConfiguration executorConfiguration;
|
||||
private final ObjectMapper mapper;
|
||||
|
||||
public ExecutorManagerApplication(HadoopConfiguration hadoopConfiguration, ExecutorConfiguration executorConfiguration, Jackson2ObjectMapperBuilder builder) {
|
||||
this.hadoopConfiguration = hadoopConfiguration;
|
||||
this.executorConfiguration = executorConfiguration;
|
||||
this.mapper = builder.build();
|
||||
}
|
||||
|
||||
public class ExecutorManagerApplication {
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(ExecutorManagerApplication.class, args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) throws Exception {
|
||||
String taskId = IdUtil.nanoId(8);
|
||||
|
||||
Configuration configuration = new Configuration();
|
||||
configuration.setBoolean(JobManagerOptions.JVM_DIRECT_MEMORY_LIMIT_ENABLED, true);
|
||||
configuration.setString(AkkaOptions.ASK_TIMEOUT, "10 min");
|
||||
configuration.setString(AkkaOptions.TCP_TIMEOUT, "15 min");
|
||||
configuration.setString(AkkaOptions.LOOKUP_TIMEOUT, "10 min");
|
||||
configuration.set(ClientOptions.CLIENT_TIMEOUT, Duration.ofMinutes(30));
|
||||
// Kerberos认证
|
||||
configuration.setBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE, true);
|
||||
configuration.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, hadoopConfiguration.getKerberosKeytabPath());
|
||||
configuration.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, hadoopConfiguration.getKerberosPrincipal());
|
||||
configuration.setLong(HeartbeatManagerOptions.HEARTBEAT_INTERVAL, MINUTE);
|
||||
configuration.setLong(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, HALF_HOUR);
|
||||
configuration.setString(AkkaOptions.ASK_TIMEOUT, "1 min");
|
||||
configuration.setString(AkkaOptions.TCP_TIMEOUT, "2 min");
|
||||
configuration.setBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER, false);
|
||||
configuration.setString(YarnConfigOptions.APPLICATION_ATTEMPTS, "4");
|
||||
configuration.setString(YarnConfigOptions.STAGING_DIRECTORY, executorConfiguration.getStagingDirectory());
|
||||
configuration.setString(ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX + "MALLOC_ARENA_MAX", "1");
|
||||
configuration.setString(ResourceManagerOptions.CONTAINERIZED_TASK_MANAGER_ENV_PREFIX + "MALLOC_ARENA_MAX", "1");
|
||||
configuration.setInteger(RestOptions.PORT, 8081);
|
||||
configuration.setString(RestOptions.BIND_PORT, "8084-9400");
|
||||
configuration.setString(DeploymentOptions.TARGET, YarnDeploymentTarget.APPLICATION.getName());
|
||||
configuration.set(JobManagerOptions.TOTAL_PROCESS_MEMORY, MemorySize.parse("5120m"));
|
||||
configuration.set(JobManagerOptions.JVM_METASPACE, MemorySize.parse("128m"));
|
||||
configuration.set(TaskManagerOptions.TOTAL_FLINK_MEMORY, MemorySize.parse("1024m"));
|
||||
configuration.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("0m"));
|
||||
configuration.set(TaskManagerOptions.JVM_METASPACE, MemorySize.parse("128m"));
|
||||
configuration.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 10);
|
||||
configuration.setString(YarnConfigOptions.APPLICATION_NAME, "Service_Task_" + taskId);
|
||||
configuration.setString(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_DIRS, executorConfiguration.getHistoryServerArchiveDir());
|
||||
configuration.setLong(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_REFRESH_INTERVAL, 10000);
|
||||
|
||||
configuration.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, true);
|
||||
configuration.setString(YarnConfiguration.LOG_AGGREGATION_REMOTE_APP_LOG_DIR_FMT, "/app-logs");
|
||||
|
||||
configuration.set(PipelineOptions.JARS, new ArrayList<String>() {{
|
||||
add(executorConfiguration.getTaskJarPath());
|
||||
}});
|
||||
ApplicationId applicationId = Runner.run(
|
||||
configuration,
|
||||
"com.lanyuanxiaoyao.service.executor.task.AvroScanner",
|
||||
new String[]{
|
||||
TaskConstants.TASK_CONTEXT_OPTION,
|
||||
mapper.writeValueAsString(
|
||||
new TaskContext(
|
||||
taskId,
|
||||
executorConfiguration.getTaskResultPath(),
|
||||
Maps.mutable.of("key", "123456", "hdfs", "hdfs://b2/apps/datalake/hive/dws_test/external_table_hudi/dws_ord_prod_inst_attr")
|
||||
)
|
||||
)
|
||||
}
|
||||
);
|
||||
System.out.println(applicationId);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
package com.lanyuanxiaoyao.service.executor.manager.controller;

import com.lanyuanxiaoyao.service.executor.manager.service.TaskService;
import java.io.IOException;
import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST endpoints for the distributed keyword-scan feature: {@code GET /task/scan}
 * submits a scan job via {@link TaskService}, and {@code GET /task/results}
 * reads back the lines a finished job wrote to its result directory.
 *
 * @author lanyuanxiaoyao
 * @date 2024-01-10
 */
@RestController
@RequestMapping("task")
public class TaskController {
    // NOTE(review): declared but never used in this class — presumably reserved
    // for request logging; confirm or remove.
    private static final Logger logger = LoggerFactory.getLogger(TaskController.class);

    // Delegate that builds the Flink configuration, submits the YARN job and
    // fetches results from HDFS.
    private final TaskService taskService;

    public TaskController(TaskService taskService) {
        this.taskService = taskService;
    }

    /**
     * Submits a scan job over the given HDFS path looking for {@code key}.
     * The boolean flags select which file categories to scan; returns whatever
     * identifier {@link TaskService#scanAvro} produces (the YARN application id).
     *
     * @throws Exception propagated from job submission
     */
    @GetMapping("scan")
    public String scan(
        @RequestParam("hdfs") String hdfs,
        @RequestParam("key") String key,
        @RequestParam(value = "scan_log", defaultValue = "true") Boolean scanLog,
        @RequestParam(value = "scan_data", defaultValue = "false") Boolean scanData,
        @RequestParam(value = "scan_source", defaultValue = "false") Boolean scanSource,
        @RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget
    ) throws Exception {
        return taskService.scanAvro(hdfs, key, scanLog, scanData, scanSource, scanTarget);
    }

    /**
     * Returns up to {@code limit} result lines for a previously submitted task.
     *
     * @param taskId id returned when the task was submitted
     * @param limit maximum number of lines to return (default 1000)
     * @throws IOException if the result files cannot be read from HDFS
     */
    @GetMapping("results")
    public ImmutableList<String> results(
        @RequestParam("task_id") String taskId,
        @RequestParam(value = "limit", defaultValue = "1000") Integer limit
    ) throws IOException {
        return taskService.taskResult(taskId, limit);
    }
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package com.lanyuanxiaoyao.service.executor.manager.service;
|
||||
|
||||
import cn.hutool.core.io.IoUtil;
|
||||
import cn.hutool.core.util.IdUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import com.eshore.odcp.hudi.connector.utils.executor.Runner;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.lanyuanxiaoyao.service.executor.core.TaskConstants;
|
||||
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
|
||||
import com.lanyuanxiaoyao.service.executor.manager.configuration.ExecutorConfiguration;
|
||||
import com.lanyuanxiaoyao.service.executor.manager.configuration.HadoopConfiguration;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import org.apache.flink.client.cli.ClientOptions;
|
||||
import org.apache.flink.configuration.*;
|
||||
import org.apache.flink.yarn.configuration.YarnConfigOptions;
|
||||
import org.apache.flink.yarn.configuration.YarnDeploymentTarget;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.eclipse.collections.api.factory.Lists;
|
||||
import org.eclipse.collections.api.factory.Maps;
|
||||
import org.eclipse.collections.api.list.ImmutableList;
|
||||
import org.eclipse.collections.api.list.MutableList;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.cache.annotation.Cacheable;
|
||||
import org.springframework.http.converter.json.Jackson2ObjectMapperBuilder;
|
||||
import org.springframework.retry.annotation.Retryable;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import static com.eshore.odcp.hudi.connector.Constants.HALF_HOUR;
|
||||
import static com.eshore.odcp.hudi.connector.Constants.MINUTE;
|
||||
|
||||
/**
|
||||
* @author lanyuanxiaoyao
|
||||
* @date 2024-01-10
|
||||
*/
|
||||
@Service
|
||||
public class TaskService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(TaskService.class);
|
||||
|
||||
private final HadoopConfiguration hadoopConfiguration;
|
||||
private final ExecutorConfiguration executorConfiguration;
|
||||
private final ObjectMapper mapper;
|
||||
|
||||
public TaskService(HadoopConfiguration hadoopConfiguration, ExecutorConfiguration executorConfiguration, Jackson2ObjectMapperBuilder builder) {
|
||||
this.hadoopConfiguration = hadoopConfiguration;
|
||||
this.executorConfiguration = executorConfiguration;
|
||||
this.mapper = builder.build();
|
||||
}
|
||||
|
||||
private String taskId() {
|
||||
return IdUtil.nanoId(8);
|
||||
}
|
||||
|
||||
private Configuration generateConfiguration(String taskId, String name) {
|
||||
Configuration configuration = new Configuration();
|
||||
configuration.setBoolean(JobManagerOptions.JVM_DIRECT_MEMORY_LIMIT_ENABLED, true);
|
||||
configuration.setString(AkkaOptions.ASK_TIMEOUT, "10 min");
|
||||
configuration.setString(AkkaOptions.TCP_TIMEOUT, "15 min");
|
||||
configuration.setString(AkkaOptions.LOOKUP_TIMEOUT, "10 min");
|
||||
configuration.set(ClientOptions.CLIENT_TIMEOUT, Duration.ofMinutes(30));
|
||||
// Kerberos认证
|
||||
configuration.setBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE, true);
|
||||
configuration.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, hadoopConfiguration.getKerberosKeytabPath());
|
||||
configuration.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, hadoopConfiguration.getKerberosPrincipal());
|
||||
configuration.setLong(HeartbeatManagerOptions.HEARTBEAT_INTERVAL, MINUTE);
|
||||
configuration.setLong(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, HALF_HOUR);
|
||||
configuration.setString(AkkaOptions.ASK_TIMEOUT, "1 min");
|
||||
configuration.setString(AkkaOptions.TCP_TIMEOUT, "2 min");
|
||||
configuration.setBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER, false);
|
||||
configuration.setString(YarnConfigOptions.APPLICATION_ATTEMPTS, "4");
|
||||
configuration.setString(YarnConfigOptions.STAGING_DIRECTORY, executorConfiguration.getStagingDirectory());
|
||||
configuration.setString(ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX + "MALLOC_ARENA_MAX", "1");
|
||||
configuration.setString(ResourceManagerOptions.CONTAINERIZED_TASK_MANAGER_ENV_PREFIX + "MALLOC_ARENA_MAX", "1");
|
||||
configuration.setInteger(RestOptions.PORT, 8081);
|
||||
configuration.setString(RestOptions.BIND_PORT, "8084-9400");
|
||||
configuration.setString(DeploymentOptions.TARGET, YarnDeploymentTarget.APPLICATION.getName());
|
||||
configuration.set(JobManagerOptions.TOTAL_PROCESS_MEMORY, MemorySize.parse("2048m"));
|
||||
configuration.set(JobManagerOptions.JVM_METASPACE, MemorySize.parse("128m"));
|
||||
configuration.set(TaskManagerOptions.TOTAL_FLINK_MEMORY, MemorySize.parse("10240m"));
|
||||
configuration.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("0m"));
|
||||
configuration.set(TaskManagerOptions.JVM_METASPACE, MemorySize.parse("128m"));
|
||||
configuration.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 10);
|
||||
configuration.setString(YarnConfigOptions.APPLICATION_NAME, "Service_Task_" + name + "_" + taskId);
|
||||
configuration.setString(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_DIRS, executorConfiguration.getHistoryServerArchiveDir());
|
||||
configuration.setLong(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_REFRESH_INTERVAL, 10000);
|
||||
|
||||
configuration.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, true);
|
||||
configuration.setString(YarnConfiguration.LOG_AGGREGATION_REMOTE_APP_LOG_DIR_FMT, "/app-logs");
|
||||
|
||||
configuration.set(PipelineOptions.JARS, new ArrayList<String>() {{
|
||||
add(executorConfiguration.getTaskJarPath());
|
||||
}});
|
||||
return configuration;
|
||||
}
|
||||
|
||||
public String scanAvro(String hdfs, String key, Boolean scanLog, Boolean scanData, Boolean scanSource, Boolean scanTarget) throws Exception {
|
||||
String taskId = taskId();
|
||||
ApplicationId applicationId = Runner.run(
|
||||
generateConfiguration(taskId, "scan"),
|
||||
"com.lanyuanxiaoyao.service.executor.task.DataScanner",
|
||||
new String[]{
|
||||
TaskConstants.TASK_CONTEXT_OPTION,
|
||||
mapper.writeValueAsString(
|
||||
new TaskContext(
|
||||
taskId,
|
||||
executorConfiguration.getTaskResultPath(),
|
||||
Maps.mutable.of(
|
||||
"key",
|
||||
key,
|
||||
"hdfs",
|
||||
hdfs,
|
||||
"scan_log",
|
||||
scanLog,
|
||||
"scan_data",
|
||||
scanData
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
);
|
||||
return applicationId.toString();
|
||||
}
|
||||
|
||||
@Cacheable(value = "results", sync = true)
|
||||
@Retryable(Throwable.class)
|
||||
public ImmutableList<String> taskResult(String taskId, Integer limit) throws IOException {
|
||||
Path resultPath = new Path(executorConfiguration.getTaskResultPath(), taskId);
|
||||
MutableList<String> results = Lists.mutable.empty();
|
||||
try (FileSystem fileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration())) {
|
||||
if (!fileSystem.exists(resultPath)) {
|
||||
throw new RuntimeException(StrUtil.format("Task {} result not found", taskId));
|
||||
}
|
||||
for (FileStatus status : fileSystem.listStatus(resultPath)) {
|
||||
if (status.isFile() && results.size() < limit) {
|
||||
try (FSDataInputStream file = fileSystem.open(status.getPath())) {
|
||||
IoUtil.readLines(file, Charset.defaultCharset(), results);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return results
|
||||
.reject(StrUtil::isBlank)
|
||||
.collect(StrUtil::trim)
|
||||
.toImmutable();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user