feat(executor-task): 数据扫描增加pulsar队列读取

This commit is contained in:
2024-01-19 14:36:41 +08:00
parent 521e82104f
commit 9140a39bf1
22 changed files with 796 additions and 105 deletions

View File

@@ -10,14 +10,18 @@ public class TableInfoSearchCache {
private Long flinkJobId;
private String alias;
private String hdfs;
private String pulsar;
private String topic;
public TableInfoSearchCache() {
}
public TableInfoSearchCache(Long flinkJobId, String alias, String hdfs) {
public TableInfoSearchCache(Long flinkJobId, String alias, String hdfs, String pulsar, String topic) {
this.flinkJobId = flinkJobId;
this.alias = alias;
this.hdfs = hdfs;
this.pulsar = pulsar;
this.topic = topic;
}
public Long getFlinkJobId() {
@@ -44,12 +48,30 @@ public class TableInfoSearchCache {
this.hdfs = hdfs;
}
public String getPulsar() {
return pulsar;
}
public void setPulsar(String pulsar) {
this.pulsar = pulsar;
}
public String getTopic() {
return topic;
}
public void setTopic(String topic) {
this.topic = topic;
}
@Override
public String toString() {
return "TableInfoSearchCache{" +
"flinkJobId=" + flinkJobId +
", alias='" + alias + '\'' +
", hdfs='" + hdfs + '\'' +
'}';
"flinkJobId=" + flinkJobId +
", alias='" + alias + '\'' +
", hdfs='" + hdfs + '\'' +
", pulsar='" + pulsar + '\'' +
", topic='" + topic + '\'' +
'}';
}
}

View File

@@ -89,6 +89,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.pulsar</groupId>
<artifactId>pulsar-client</artifactId>
<version>2.8.0</version>
</dependency>
</dependencies>
<build>

View File

@@ -1,5 +1,6 @@
package com.lanyuanxiaoyao.service.executor.manager.controller;
import cn.hutool.core.util.StrUtil;
import com.lanyuanxiaoyao.service.executor.manager.service.TaskService;
import java.io.IOException;
import org.eclipse.collections.api.list.ImmutableList;
@@ -27,14 +28,27 @@ public class TaskController {
@GetMapping("scan")
public String scan(
@RequestParam("hdfs") String hdfs,
@RequestParam("key") String key,
@RequestParam(value = "scan_log", defaultValue = "true") Boolean scanLog,
@RequestParam(value = "scan_data", defaultValue = "false") Boolean scanData,
@RequestParam(value = "hdfs", required = false) String hdfs,
@RequestParam(value = "pulsar", required = false) String pulsar,
@RequestParam(value = "pulsar_topic", required = false) String pulsarTopic,
@RequestParam(value = "scan_source", defaultValue = "false") Boolean scanSource,
@RequestParam(value = "scan_queue", defaultValue = "false") Boolean scanQueue,
@RequestParam(value = "scan_log", defaultValue = "false") Boolean scanLog,
@RequestParam(value = "scan_base", defaultValue = "false") Boolean scanBase,
@RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget
) throws Exception {
return taskService.scanAvro(hdfs, key, scanLog, scanData, scanSource, scanTarget);
logger.info("Enter method: scan[key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget]. " + "key:" + key + "," + "hdfs:" + hdfs + "," + "pulsar:" + pulsar + "," + "pulsarTopic:" + pulsarTopic + "," + "scanSource:" + scanSource + "," + "scanQueue:" + scanQueue + "," + "scanLog:" + scanLog + "," + "scanBase:" + scanBase + "," + "scanTarget:" + scanTarget);
if (!scanSource && !scanQueue && !scanLog && !scanBase && !scanTarget) {
throw new RuntimeException("Must choose one mode");
}
if (scanQueue && (StrUtil.isBlank(pulsar) || StrUtil.isBlank(pulsar))) {
throw new RuntimeException("Pulsar topic or url cannot be empty");
}
if ((scanLog || scanBase) && StrUtil.isBlank(hdfs)) {
throw new RuntimeException("Hdfs path cannot be empty");
}
return taskService.scanAvro(key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget);
}
@GetMapping("results")

View File

@@ -1,6 +1,8 @@
package com.lanyuanxiaoyao.service.executor.manager.service;
import cn.hutool.core.io.IoUtil;
import cn.hutool.core.map.MapBuilder;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.StrUtil;
import com.eshore.odcp.hudi.connector.utils.executor.Runner;
@@ -108,11 +110,38 @@ public class TaskService {
return configuration;
}
public String scanAvro(String hdfs, String key, Boolean scanLog, Boolean scanData, Boolean scanSource, Boolean scanTarget) throws Exception {
public String scanAvro(
String key,
String hdfs,
String pulsar,
String pulsarTopic,
Boolean scanSource,
Boolean scanQueue,
Boolean scanLog,
Boolean scanBase,
Boolean scanTarget
) throws Exception {
String taskId = taskId();
Configuration configuration = generateConfiguration(taskId, "scan");
setEnvironment(configuration, "hdfs", hdfs);
MapBuilder<String, Object> builder = MapUtil.builder();
setEnvironment(configuration, "key", key);
builder.put("key", key);
if (scanLog || scanBase) {
setEnvironment(configuration, "hdfs", hdfs);
builder.put("scan_log", scanLog);
builder.put("scan_base", scanBase);
builder.put("hdfs", hdfs);
}
if (scanQueue) {
setEnvironment(configuration, "pulsar", pulsar);
setEnvironment(configuration, "pulsar_topic", pulsarTopic);
builder.put("scan_queue", true);
builder.put("pulsar", pulsar);
builder.put("pulsar_topic", pulsarTopic);
}
ApplicationId applicationId = Runner.run(
configuration,
"com.lanyuanxiaoyao.service.executor.task.DataScanner",
@@ -122,16 +151,7 @@ public class TaskService {
new TaskContext(
taskId,
executorConfiguration.getTaskResultPath(),
Maps.mutable.of(
"key",
key,
"hdfs",
hdfs,
"scan_log",
scanLog,
"scan_data",
scanData
)
Maps.mutable.ofMap(builder.build())
)
)
}

View File

@@ -2,7 +2,7 @@ spring:
application:
name: service-executor-manager
profiles:
include: random-port,common,discovery,metrics
include: random-port,common,discovery,metrics,forest
executor:
staging-directory: hdfs://b2/apps/datalake/yarn
history-server-archive-dir: hdfs://b2/apps/flink/completed-jobs/

View File

@@ -77,6 +77,12 @@
<version>10.4.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.pulsar</groupId>
<artifactId>pulsar-client</artifactId>
<version>2.8.0</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>

View File

@@ -6,11 +6,13 @@ import com.lanyuanxiaoyao.service.configuration.ExecutorProvider;
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
import com.lanyuanxiaoyao.service.executor.task.functions.ReadHudiFile;
import com.lanyuanxiaoyao.service.executor.task.functions.pulsar.ReadPulsarSource;
import com.lanyuanxiaoyao.service.executor.task.helper.ArgumentsHelper;
import com.lanyuanxiaoyao.service.executor.task.helper.FlinkHelper;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.hadoop.conf.Configuration;
@@ -67,70 +69,92 @@ public class DataScanner {
logger.info("Context: {}", taskContext);
Map<String, Object> metadata = taskContext.getMetadata();
ArgumentsHelper.checkMetadata(taskContext, "hdfs");
String hdfs = (String) metadata.get("hdfs");
ArgumentsHelper.checkMetadata(taskContext, "key");
String key = (String) metadata.get("key");
Boolean scanLog = (Boolean) metadata.getOrDefault("scan_log", true);
Boolean scanData = (Boolean) metadata.getOrDefault("scan_data", false);
if (!scanLog && !scanData) {
throw new RuntimeException("Must choose mode scan_log or scan_data");
}
Boolean scanQueue = (Boolean) metadata.getOrDefault("scan_queue", false);
Boolean scanLog = (Boolean) metadata.getOrDefault("scan_log", false);
Boolean scanBase = (Boolean) metadata.getOrDefault("scan_base", false);
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.get(configuration);
if (!fileSystem.exists(new Path(hdfs))) {
throw new RuntimeException(StrUtil.format("HDFS {} is not exists", hdfs));
if (!scanQueue && !scanLog && !scanBase) {
throw new RuntimeException("Must choose mode scan_queue or scan_log or scan_data");
}
ImmutableList<Path> paths = Lists.immutable.of(fileSystem.listStatus(new Path(hdfs)))
.reject(status -> StrUtil.equals(".hoodie", status.getPath().getName()))
.flatCollect(status -> {
try {
if (status.isDirectory()) {
return Lists.immutable.of(fileSystem.listStatus(status.getPath()));
} else {
return Lists.immutable.of(status);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
})
.collect(FileStatus::getPath);
StreamExecutionEnvironment environment = FlinkHelper.getBatchEnvironment();
DataStream<RecordView> source = null;
int totalParallelism = 20;
if (scanLog) {
ImmutableList<String> logPaths = paths.select(FSUtils::isLogFile).collect(Path::toString);
int parallelism = Math.max(1, Math.min(logPaths.size() / 20, 100));
totalParallelism = Math.max(totalParallelism, parallelism);
source = environment
.fromCollection(logPaths.toList())
.name("Read log paths")
.flatMap(new ReadHudiFile())
.name("Read hudi file")
.setParallelism(parallelism);
}
if (scanData) {
ImmutableList<String> dataPaths = parsePaths(fileSystem, paths.select(FSUtils::isBaseFile));
int parallelism = Math.max(1, Math.min(dataPaths.size() / 2, 500));
totalParallelism = Math.max(totalParallelism, parallelism);
if (scanQueue) {
ArgumentsHelper.checkMetadata(taskContext, "pulsar");
String pulsarUrl = (String) metadata.get("pulsar");
ArgumentsHelper.checkMetadata(taskContext, "pulsar_topic");
String pulsarTopic = (String) metadata.get("pulsar_topic");
logger.info("Scan queue topic: {} url: {}", pulsarTopic, pulsarUrl);
DataStream<RecordView> stream = environment
.fromSource(new ReadPulsarSource(taskContext, pulsarUrl, pulsarTopic, 50), WatermarkStrategy.noWatermarks(), "Read pulsar")
.setParallelism(50)
.disableChaining();
if (ObjectUtil.isNull(source)) {
source = environment
source = stream;
} else {
source = source.union(stream);
}
}
if (scanLog || scanBase) {
ArgumentsHelper.checkMetadata(taskContext, "hdfs");
String hdfs = (String) metadata.get("hdfs");
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.get(configuration);
if (!fileSystem.exists(new Path(hdfs))) {
throw new RuntimeException(StrUtil.format("HDFS {} is not exists", hdfs));
}
ImmutableList<Path> paths = Lists.immutable.of(fileSystem.listStatus(new Path(hdfs)))
.reject(status -> StrUtil.equals(".hoodie", status.getPath().getName()))
.flatCollect(status -> {
try {
if (status.isDirectory()) {
return Lists.immutable.of(fileSystem.listStatus(status.getPath()));
} else {
return Lists.immutable.of(status);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
})
.collect(FileStatus::getPath);
if (scanLog) {
logger.info("Scan log hdfs: {}", hdfs);
ImmutableList<String> logPaths = paths.select(FSUtils::isLogFile).collect(Path::toString);
int parallelism = Math.max(1, Math.min(logPaths.size() / 20, 100));
totalParallelism = Math.max(totalParallelism, parallelism);
DataStream<RecordView> stream = environment
.fromCollection(logPaths.toList())
.name("Read log paths")
.flatMap(new ReadHudiFile())
.name("Read hudi file")
.setParallelism(parallelism);
if (ObjectUtil.isNull(source)) {
source = stream;
} else {
source = source.union(stream);
}
}
if (scanBase) {
logger.info("Scan base hdfs: {}", hdfs);
ImmutableList<String> dataPaths = parsePaths(fileSystem, paths.select(FSUtils::isBaseFile));
int parallelism = Math.max(1, Math.min(dataPaths.size() / 2, 500));
totalParallelism = Math.max(totalParallelism, parallelism);
DataStream<RecordView> stream = environment
.fromCollection(dataPaths.toList())
.name("Read base paths")
.flatMap(new ReadHudiFile())
.name("Read hudi file")
.setParallelism(parallelism);
} else {
source = source.union(environment
.fromCollection(dataPaths.toList())
.name("Read base paths")
.flatMap(new ReadHudiFile())
.name("Read hudi file")
.setParallelism(parallelism));
if (ObjectUtil.isNull(source)) {
source = stream;
} else {
source = source.union(stream);
}
}
}
if (ObjectUtil.isNull(source)) {
@@ -147,6 +171,6 @@ public class DataScanner {
.sinkTo(FlinkHelper.createFileSink(taskContext))
.setParallelism(10)
.name("Output results");
environment.execute(StrUtil.format("Search {} in {}", key, hdfs));
environment.execute();
}
}

View File

@@ -12,11 +12,13 @@ import java.util.Map;
* @date 2024-01-09
*/
public class RecordView implements Serializable, Comparable<RecordView> {
private final Operation operation;
private final String data;
private final String timestamp;
private final String file;
private final Map<String, Object> attributes;
private Operation operation;
private String data;
private String timestamp;
private String file;
private Map<String, Object> attributes;
public RecordView() {}
public RecordView(Operation operation, String data, String timestamp, String file) {
this.operation = operation;
@@ -30,22 +32,42 @@ public class RecordView implements Serializable, Comparable<RecordView> {
return operation;
}
public void setOperation(Operation operation) {
this.operation = operation;
}
public String getData() {
return data;
}
public void setData(String data) {
this.data = data;
}
public String getTimestamp() {
return timestamp;
}
public void setTimestamp(String timestamp) {
this.timestamp = timestamp;
}
public String getFile() {
return file;
}
public void setFile(String file) {
this.file = file;
}
public Map<String, Object> getAttributes() {
return attributes;
}
public void setAttributes(Map<String, Object> attributes) {
this.attributes = attributes;
}
@Override
public String toString() {
return StrUtil.format("{} {} {} {}", operation, timestamp, file, data);

View File

@@ -0,0 +1,110 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import cn.hutool.core.util.StrUtil;
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
import java.io.Serializable;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.connector.source.*;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.pulsar.client.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * Bounded Flink source that scans the full history of a Pulsar topic.
 * <p>
 * At construction time it probes the topic once with a throw-away consumer:
 * the publish time of the earliest message and "now" bound the scan window,
 * which is then cut into time slices of at least one hour each, aimed at
 * roughly {@code parallelism} slices. Each slice becomes a {@link ReadPulsarSplit}
 * that a {@link ReadPulsarSourceReader} replays independently.
 * <p>
 * NOTE(review): the constructor performs blocking Pulsar I/O; it is expected to
 * run on the client/JobManager side before the job graph is submitted.
 */
public class ReadPulsarSource implements Source<RecordView, ReadPulsarSplit, Collection<ReadPulsarSplit>>, ResultTypeQueryable<RecordView>, Serializable {
    private static final Logger logger = LoggerFactory.getLogger(ReadPulsarSource.class);
    /** Lower bound on a slice width: one hour in milliseconds. */
    private static final Long MIN_GAP = 60 * 60 * 1000L;
    /** Human-readable timestamp format used only for logging the planned splits. */
    private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")
        .withLocale(Locale.CHINA)
        .withZone(ZoneId.systemDefault());
    /** Pre-computed time-range splits covering [earliest publish time, now]. */
    private final Collection<ReadPulsarSplit> splits;

    /**
     * @param taskContext  task metadata; its id namespaces the probe subscription
     * @param pulsarUrl    Pulsar service URL
     * @param pulsarTopic  topic to scan
     * @param parallelism  desired number of time slices (and reader parallelism)
     * @throws PulsarClientException if the probe consumer cannot be created
     */
    public ReadPulsarSource(TaskContext taskContext, String pulsarUrl, String pulsarTopic, Integer parallelism) throws PulsarClientException {
        try (PulsarClient client = PulsarClient.builder()
            .serviceUrl(pulsarUrl)
            .build()) {
            try (Consumer<byte[]> consumer = client.newConsumer()
                .topic(pulsarTopic)
                .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest)
                .subscriptionMode(SubscriptionMode.NonDurable)
                .subscriptionType(SubscriptionType.Exclusive)
                .subscriptionName(StrUtil.format("Task_Reader_Detect_{}", taskContext.getTaskId()))
                .startMessageIdInclusive()
                .subscribe()) {
                MessageId latestMessageId = consumer.getLastMessageId();
                // Subscription starts at Earliest, so the first received message
                // carries the oldest publish time still retained on the topic.
                Message<byte[]> message = consumer.receive();
                long startTimestamp = message.getPublishTime();
                long endTimestamp = Instant.now().toEpochMilli();
                // Bug fix: parallelism == 1 previously divided by zero.
                long slices = Math.max(1L, parallelism - 1L);
                long gap = Math.max((endTimestamp - startTimestamp) / slices, MIN_GAP);
                logger.info("Gap: {}, Parallelism: {}", gap, parallelism);
                List<ReadPulsarSplit> tasks = new ArrayList<>();
                while (startTimestamp < endTimestamp) {
                    tasks.add(new ReadPulsarSplit(
                        taskContext.getTaskId(),
                        pulsarUrl,
                        pulsarTopic,
                        latestMessageId.toString(),
                        startTimestamp,
                        startTimestamp + gap
                    ));
                    startTimestamp += gap;
                }
                // Robustness: if the earliest publish time is not strictly before "now"
                // the loop above produces nothing; still emit one split so the
                // message(s) that do exist are read.
                if (tasks.isEmpty()) {
                    tasks.add(new ReadPulsarSplit(
                        taskContext.getTaskId(),
                        pulsarUrl,
                        pulsarTopic,
                        latestMessageId.toString(),
                        startTimestamp,
                        startTimestamp + gap
                    ));
                }
                splits = tasks;
                for (ReadPulsarSplit split : splits) {
                    logger.info("Read split: {} -> {}", convertTimestamp(split.getStartTimestamp()), convertTimestamp(split.getEndTimestamp()));
                }
            }
        }
    }

    /** Formats an epoch-millis timestamp for log output. (Renamed from the typo "covertTimestamp".) */
    private static String convertTimestamp(Long timestamp) {
        return FORMATTER.format(Instant.ofEpochMilli(timestamp));
    }

    @Override
    public Boundedness getBoundedness() {
        // The split set is fixed at construction time, so the source is bounded.
        return Boundedness.BOUNDED;
    }

    @Override
    public SourceReader<RecordView, ReadPulsarSplit> createReader(SourceReaderContext readerContext) throws PulsarClientException {
        return new ReadPulsarSourceReader(readerContext);
    }

    @Override
    public SplitEnumerator<ReadPulsarSplit, Collection<ReadPulsarSplit>> createEnumerator(SplitEnumeratorContext<ReadPulsarSplit> enumContext) throws Exception {
        return new ReadPulsarSourceEnumerator(enumContext, splits);
    }

    @Override
    public SplitEnumerator<ReadPulsarSplit, Collection<ReadPulsarSplit>> restoreEnumerator(SplitEnumeratorContext<ReadPulsarSplit> enumContext, Collection<ReadPulsarSplit> checkpoint) throws Exception {
        // Restore simply re-enqueues the checkpointed (not yet assigned) splits.
        return new ReadPulsarSourceEnumerator(enumContext, checkpoint);
    }

    @Override
    public SimpleVersionedSerializer<ReadPulsarSplit> getSplitSerializer() {
        return new ReadPulsarVersionedSplitSerializer();
    }

    @Override
    public SimpleVersionedSerializer<Collection<ReadPulsarSplit>> getEnumeratorCheckpointSerializer() {
        return new ReadPulsarVersionedCheckpointSerializer();
    }

    @Override
    public TypeInformation<RecordView> getProducedType() {
        return TypeInformation.of(RecordView.class);
    }
}

View File

@@ -0,0 +1,63 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import cn.hutool.core.util.ObjectUtil;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
import javax.annotation.Nullable;
import org.apache.flink.api.connector.source.SplitEnumerator;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * Hands out the pre-computed {@link ReadPulsarSplit}s one at a time, on demand,
 * to whichever reader subtask asks for work (simple pull-based load balancing).
 * <p>
 * NOTE(review): the class is declared {@code Serializable} but holds a
 * {@link SplitEnumeratorContext}, which is not serializable in general —
 * confirm whether the marker interface is actually needed here.
 */
public class ReadPulsarSourceEnumerator implements SplitEnumerator<ReadPulsarSplit, Collection<ReadPulsarSplit>>, Serializable {
    private static final Logger logger = LoggerFactory.getLogger(ReadPulsarSourceEnumerator.class);
    private final SplitEnumeratorContext<ReadPulsarSplit> context;
    /** Splits not yet assigned to any reader. */
    private final Queue<ReadPulsarSplit> readQueue;

    public ReadPulsarSourceEnumerator(SplitEnumeratorContext<ReadPulsarSplit> context, Collection<ReadPulsarSplit> splits) {
        this.context = context;
        this.readQueue = new ArrayDeque<>(splits);
    }

    @Override
    public void start() {
        // Nothing to do: all splits were computed up front by the source.
    }

    @Override
    public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) {
        final ReadPulsarSplit split = readQueue.poll();
        if (ObjectUtil.isNotNull(split)) {
            logger.info("Assign split for {}, split: {}", subtaskId, split);
            context.assignSplit(split, subtaskId);
        } else {
            // Queue drained: tell the reader it can finish.
            logger.info("No more split for {}", subtaskId);
            context.signalNoMoreSplits(subtaskId);
        }
    }

    @Override
    public void addSplitsBack(List<ReadPulsarSplit> splits, int subtaskId) {
        // Splits from a failed reader are simply re-queued for reassignment.
        readQueue.addAll(splits);
    }

    @Override
    public void addReader(int subtaskId) {
        // Readers pull work via handleSplitRequest; nothing to do on registration.
    }

    @Override
    public Collection<ReadPulsarSplit> snapshotState(long checkpointId) throws Exception {
        // Bug fix: return a defensive copy instead of the live queue, so the
        // checkpoint cannot be mutated by assignments made after the snapshot.
        return new ArrayDeque<>(readQueue);
    }

    @Override
    public void close() throws IOException {
        // No external resources held.
    }
}

View File

@@ -0,0 +1,152 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil;
import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
import java.io.Serializable;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayDeque;
import java.util.List;
import java.util.Locale;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.flink.api.connector.source.ReaderOutput;
import org.apache.flink.api.connector.source.SourceReader;
import org.apache.flink.api.connector.source.SourceReaderContext;
import org.apache.flink.core.io.InputStatus;
import org.apache.pulsar.client.api.*;
import org.apache.pulsar.client.internal.DefaultImplementation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * Reader side of {@link ReadPulsarSource}: for each assigned {@link ReadPulsarSplit}
 * it opens a short-lived, non-durable consumer, seeks to the split's start
 * timestamp, and replays messages whose publish time falls inside the split's
 * window, emitting each as a {@link RecordView}.
 * <p>
 * NOTE(review): declared {@code Serializable} while holding a
 * {@link SourceReaderContext}; confirm the marker interface is required.
 */
public class ReadPulsarSourceReader implements SourceReader<RecordView, ReadPulsarSplit>, Serializable {
    private static final Logger logger = LoggerFactory.getLogger(ReadPulsarSourceReader.class);
    /** Compact timestamp format used in the emitted RecordView. */
    private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS")
        .withLocale(Locale.CHINA)
        .withZone(ZoneId.systemDefault());
    /** Splits assigned by the enumerator but not yet processed. */
    private final Queue<ReadPulsarSplit> readQueue = new ArrayDeque<>();
    private final SourceReaderContext readerContext;
    /** Completed whenever new splits arrive (or no-more-splits is signalled). */
    private CompletableFuture<Void> availability = new CompletableFuture<>();
    /** Split being replayed by the current pollNext call, if any. */
    private ReadPulsarSplit currentSplit;
    private boolean noMoreSplits = false;

    public ReadPulsarSourceReader(SourceReaderContext readerContext) throws PulsarClientException {
        this.readerContext = readerContext;
    }

    /**
     * Parses a "ledgerId:entryId"-style message-id string.
     * NOTE(review): not referenced in this class — presumably kept for the
     * stored {@code latestMessageId} on the split; confirm or remove.
     */
    private static MessageId parseMessageId(String messageIdText) {
        String[] items = messageIdText.split(":");
        return DefaultImplementation.newMessageId(Long.parseLong(items[0]), Long.parseLong(items[1]), -1);
    }

    @Override
    public void start() {
        if (readQueue.isEmpty()) {
            readerContext.sendSplitRequest();
        }
    }

    /** Wraps one Pulsar message as a SOURCE record; the message id doubles as the "file". */
    private RecordView parsePulsarMessage(Message<byte[]> message) {
        return new RecordView(
            RecordView.Operation.SOURCE,
            new String(message.getValue()),
            FORMATTER.format(Instant.ofEpochMilli(message.getPublishTime())),
            message.getMessageId().toString()
        );
    }

    @Override
    public InputStatus pollNext(ReaderOutput<RecordView> output) throws Exception {
        if (ObjectUtil.isNotNull(currentSplit)) {
            logger.info("Read split: {}", currentSplit);
            try (PulsarClient client = PulsarClient.builder()
                .serviceUrl(currentSplit.getPulsarUrl())
                .build()) {
                try (Consumer<byte[]> consumer = client.newConsumer()
                    .topic(currentSplit.getPulsarTopic())
                    .batchReceivePolicy(
                        BatchReceivePolicy.builder()
                            .timeout(1, TimeUnit.SECONDS)
                            .maxNumMessages(0)
                            .maxNumBytes(0)
                            .build()
                    )
                    .receiverQueueSize(50000)
                    .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest)
                    .subscriptionMode(SubscriptionMode.NonDurable)
                    .subscriptionType(SubscriptionType.Exclusive)
                    .subscriptionName(StrUtil.format("Task_Reader_{}_{}", currentSplit.getTaskId(), readerContext.getIndexOfSubtask()))
                    .startMessageIdInclusive()
                    .subscribe()) {
                    // Position at the first message published at or after the split start.
                    consumer.seek(currentSplit.getStartTimestamp());
                    Messages<byte[]> messages = consumer.batchReceive();
                    boolean reachedEnd = false;
                    // Bug fix: batchReceive() returns an EMPTY batch (not null) when the
                    // 1s timeout fires, so the original null-check loop could spin forever
                    // on a split whose window lies past the last message. An empty batch
                    // is now treated as "caught up" and ends the split.
                    while (ObjectUtil.isNotNull(messages) && messages.size() > 0 && !reachedEnd) {
                        for (Message<byte[]> message : messages) {
                            // Bug fix: stop BEFORE emitting messages at/after endTimestamp.
                            // The next split seeks to exactly this endTimestamp, so the
                            // original (emit-then-break) behaviour produced duplicates
                            // on every split boundary.
                            if (message.getPublishTime() >= currentSplit.getEndTimestamp()) {
                                logger.info("Break for {} -> {}", message.getPublishTime(), currentSplit.getEndTimestamp());
                                reachedEnd = true;
                                break;
                            }
                            output.collect(parsePulsarMessage(message));
                        }
                        consumer.acknowledge(messages);
                        if (!reachedEnd) {
                            messages = consumer.batchReceive();
                        }
                    }
                }
            }
        }
        return tryMoveToNextSplit();
    }

    /** Advances to the next queued split, or reports end/nothing-available. */
    private InputStatus tryMoveToNextSplit() {
        currentSplit = readQueue.poll();
        logger.info("Current split: {}", currentSplit);
        if (ObjectUtil.isNotNull(currentSplit)) {
            return InputStatus.MORE_AVAILABLE;
        } else if (noMoreSplits) {
            return InputStatus.END_OF_INPUT;
        } else {
            // Re-arm the availability future so Flink waits until addSplits /
            // notifyNoMoreSplits completes it.
            if (availability.isDone()) {
                availability = new CompletableFuture<>();
            }
            return InputStatus.NOTHING_AVAILABLE;
        }
    }

    @Override
    public List<ReadPulsarSplit> snapshotState(long checkpointId) {
        // Splits are replayed wholesale inside one pollNext call; there is no
        // partial-split progress worth checkpointing.
        return ListUtil.empty();
    }

    @Override
    public CompletableFuture<Void> isAvailable() {
        return availability;
    }

    @Override
    public void addSplits(List<ReadPulsarSplit> splits) {
        logger.info("Add splits: {}", splits);
        readQueue.addAll(splits);
        availability.complete(null);
    }

    @Override
    public void notifyNoMoreSplits() {
        logger.info("No more splits for {}", readerContext.getIndexOfSubtask());
        noMoreSplits = true;
        availability.complete(null);
    }

    @Override
    public void close() {
        // Consumers/clients are scoped to pollNext via try-with-resources.
    }
}

View File

@@ -0,0 +1,94 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import java.io.Serializable;
import org.apache.flink.api.connector.source.SourceSplit;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * One time-window unit of work for the Pulsar scan: replay messages of
 * {@code pulsarTopic} whose publish time falls in
 * [{@code startTimestamp}, {@code endTimestamp}) milliseconds.
 * A no-arg constructor plus setters keep it Jackson-friendly for the
 * versioned split/checkpoint serializers.
 */
public class ReadPulsarSplit implements SourceSplit, Serializable {
    private String taskId;
    private String pulsarUrl;
    private String pulsarTopic;
    /** Last message id on the topic at planning time (informational). */
    private String latestMessageId;
    private Long startTimestamp;
    private Long endTimestamp;

    public ReadPulsarSplit() {
    }

    public ReadPulsarSplit(String taskId, String pulsarUrl, String pulsarTopic, String latestMessageId, Long startTimestamp, Long endTimestamp) {
        this.taskId = taskId;
        this.pulsarUrl = pulsarUrl;
        this.pulsarTopic = pulsarTopic;
        this.latestMessageId = latestMessageId;
        this.startTimestamp = startTimestamp;
        this.endTimestamp = endTimestamp;
    }

    public String getTaskId() {
        return taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getPulsarUrl() {
        return pulsarUrl;
    }

    public void setPulsarUrl(String pulsarUrl) {
        this.pulsarUrl = pulsarUrl;
    }

    public String getPulsarTopic() {
        return pulsarTopic;
    }

    public void setPulsarTopic(String pulsarTopic) {
        this.pulsarTopic = pulsarTopic;
    }

    public String getLatestMessageId() {
        return latestMessageId;
    }

    public void setLatestMessageId(String latestMessageId) {
        this.latestMessageId = latestMessageId;
    }

    public Long getStartTimestamp() {
        return startTimestamp;
    }

    public void setStartTimestamp(Long startTimestamp) {
        this.startTimestamp = startTimestamp;
    }

    public Long getEndTimestamp() {
        return endTimestamp;
    }

    public void setEndTimestamp(Long endTimestamp) {
        this.endTimestamp = endTimestamp;
    }

    @Override
    public String splitId() {
        // Bug fix: fields are joined with a delimiter — bare concatenation let
        // distinct splits collide (e.g. timestamps 1/23 vs 12/3). Null fields
        // still render as "null", matching the original's tolerance.
        return taskId + "|" + pulsarUrl + "|" + pulsarTopic + "|" + startTimestamp + "|" + endTimestamp + "|" + latestMessageId;
    }

    @Override
    public String toString() {
        return "ReadPulsarSplit{" +
            "taskId='" + taskId + '\'' +
            ", pulsarUrl='" + pulsarUrl + '\'' +
            ", pulsarTopic='" + pulsarTopic + '\'' +
            ", latestMessageId='" + latestMessageId + '\'' +
            ", startTimestamp=" + startTimestamp +
            ", endTimestamp=" + endTimestamp +
            '}';
    }
}

View File

@@ -0,0 +1,33 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import org.apache.flink.core.io.SimpleVersionedSerializer;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * Serializes the enumerator checkpoint — the collection of still-unassigned
 * {@link ReadPulsarSplit}s — as a JSON array via Jackson.
 */
public class ReadPulsarVersionedCheckpointSerializer implements SimpleVersionedSerializer<Collection<ReadPulsarSplit>>, Serializable {
    // ObjectMapper is thread-safe once configured; one instance serves all calls.
    private final ObjectMapper objectMapper = new ObjectMapper();

    @Override
    public int getVersion() {
        // Initial (and only) wire-format version.
        return 0;
    }

    @Override
    public byte[] serialize(Collection<ReadPulsarSplit> obj) throws IOException {
        return objectMapper.writeValueAsBytes(obj);
    }

    @Override
    public Collection<ReadPulsarSplit> deserialize(int version, byte[] serialized) throws IOException {
        // TypeReference preserves the element type through erasure.
        TypeReference<List<ReadPulsarSplit>> listType = new TypeReference<List<ReadPulsarSplit>>() {
        };
        return objectMapper.readValue(serialized, listType);
    }
}

View File

@@ -0,0 +1,29 @@
package com.lanyuanxiaoyao.service.executor.task.functions.pulsar;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import org.apache.flink.core.io.SimpleVersionedSerializer;
/**
* @author lanyuanxiaoyao
* @date 2024-01-18
*/
/**
 * Serializes a single {@link ReadPulsarSplit} as JSON via Jackson, relying on
 * the split's no-arg constructor and bean setters for round-tripping.
 */
public class ReadPulsarVersionedSplitSerializer implements SimpleVersionedSerializer<ReadPulsarSplit>, Serializable {
    // ObjectMapper is thread-safe once configured; one instance serves all calls.
    private final ObjectMapper objectMapper = new ObjectMapper();

    @Override
    public int getVersion() {
        // Initial (and only) wire-format version.
        return 0;
    }

    @Override
    public byte[] serialize(ReadPulsarSplit obj) throws IOException {
        return objectMapper.writeValueAsBytes(obj);
    }

    @Override
    public ReadPulsarSplit deserialize(int version, byte[] serialized) throws IOException {
        return objectMapper.readValue(serialized, ReadPulsarSplit.class);
    }
}

View File

@@ -28,7 +28,7 @@ public class FlinkHelper {
public static StreamExecutionEnvironment getBatchEnvironment() {
StreamExecutionEnvironment environment = getSteamEnvironment();
environment.setRuntimeMode(RuntimeExecutionMode.BATCH);
environment.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
return environment;
}

View File

@@ -155,6 +155,18 @@ public interface InfoService {
@Get("/info/all_hdfs")
ImmutableList<String> allHdfs(@Query("key") String key);
@Get("/info/all_pulsar")
ImmutableList<String> allPulsar();
@Get("/info/all_pulsar")
ImmutableList<String> allPulsar(@Query("key") String key);
@Get("/info/all_pulsar_topic")
ImmutableList<String> allPulsarTopic();
@Get("/info/all_pulsar_topic")
ImmutableList<String> allPulsarTopic(@Query("key") String key);
@Get("/info/simple_table_metas")
ImmutableList<SimpleTableMeta> simpleTableMetas();

View File

@@ -14,10 +14,15 @@ import org.eclipse.collections.api.list.ImmutableList;
@BaseRequest(baseURL = "http://service-executor-manager")
public interface TaskService {
@Get(value = "/task/scan", readTimeout = 2 * 60 * 1000)
String scan(@Query("hdfs") String hdfs, @Query("key") String key);
@Get(value = "/task/scan", readTimeout = 2 * 60 * 1000)
String scan(@Query("hdfs") String hdfs, @Query("key") String key, @Query("scan_log") Boolean scanLog, @Query("scan_data") Boolean scanData);
String scan(
@Query("key") String key,
@Query("hdfs") String hdfs,
@Query("pulsar") String pulsar,
@Query("pulsar_topic") String pulsarTopic,
@Query("scan_queue") Boolean scanQueue,
@Query("scan_log") Boolean scanLog,
@Query("scan_base") Boolean scanBase
);
@Get("/task/results")
ImmutableList<String> results(@Query("task_id") String taskId);

View File

@@ -96,8 +96,24 @@ public class InfoController {
@GetMapping("/all_hdfs")
public ImmutableList<String> allHdfs(@RequestParam(value = "key", required = false) String key) {
return infoService.allTableInfoSearchCache()
.select(cache -> StrUtil.isBlank(key) || StrUtil.contains(cache.getAlias(), key))
.select(cache -> StrUtil.isBlank(key) || StrUtil.contains(cache.getAlias(), key) || StrUtil.contains(cache.getHdfs(), key))
.collect(TableInfoSearchCache::getHdfs)
.distinct();
}
@GetMapping("/all_pulsar")
public ImmutableList<String> allPulsar(@RequestParam(value = "key", required = false) String key) {
return infoService.allTableInfoSearchCache()
.select(cache -> StrUtil.isBlank(key) || StrUtil.contains(cache.getAlias(), key) || StrUtil.contains(cache.getPulsar(), key))
.collect(TableInfoSearchCache::getPulsar)
.distinct();
}
@GetMapping("/all_pulsar_topic")
public ImmutableList<String> allPulsarTopic(@RequestParam(value = "key", required = false) String key) {
return infoService.allTableInfoSearchCache()
.select(cache -> StrUtil.isBlank(key) || StrUtil.contains(cache.getAlias(), key) || StrUtil.contains(cache.getTopic(), key))
.collect(TableInfoSearchCache::getTopic)
.distinct();
}
}

View File

@@ -9,7 +9,6 @@ import com.lanyuanxiaoyao.service.configuration.entity.PageResponse;
import com.lanyuanxiaoyao.service.configuration.entity.info.JobAndMetas;
import com.lanyuanxiaoyao.service.configuration.entity.info.JobIdAndAlias;
import com.lanyuanxiaoyao.service.configuration.entity.info.TableInfoSearchCache;
import com.lanyuanxiaoyao.service.info.configuration.SQLLoggerProvider;
import java.util.List;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.list.ImmutableList;
@@ -180,13 +179,25 @@ public class InfoService extends BaseService {
@Retryable(Throwable.class)
public ImmutableList<TableInfoSearchCache> allTableInfoSearchCache() {
return Lists.immutable.ofAll(mysqlJdbcTemplate.query(
SqlBuilder.select(TbAppCollectTableInfo.FLINK_JOB_ID_A, TbAppCollectTableInfo.ALIAS_A, TbAppCollectTableInfo.TGT_HDFS_PATH_A)
SqlBuilder.select(
TbAppCollectTableInfo.FLINK_JOB_ID_A,
TbAppCollectTableInfo.ALIAS_A,
TbAppCollectTableInfo.TGT_HDFS_PATH_A,
TbAppCollectTableInfo.SRC_PULSAR_ADDR_A,
TbAppCollectTableInfo.SRC_TOPIC_A
)
.from(TbAppCollectTableInfo._alias_, TbAppFlinkJobConfig._alias_)
.whereEq(TbAppCollectTableInfo.FLINK_JOB_ID_A, Column.as(TbAppFlinkJobConfig.ID_A))
.andEq(TbAppFlinkJobConfig.STATUS_A, "y")
.andEq(TbAppCollectTableInfo.STATUS_A, "y")
.build(),
(rs, row) -> new TableInfoSearchCache(rs.getLong(1), rs.getString(2), rs.getString(3))
(rs, row) -> new TableInfoSearchCache(
rs.getLong(1),
rs.getString(2),
rs.getString(3),
rs.getString(4),
rs.getString(5)
)
));
}
}

View File

@@ -220,4 +220,22 @@ public class TableController extends BaseController {
}
return AmisResponse.responseSuccess(infoService.allHdfs(key).collect(Item::new));
}
@SuppressWarnings("DataFlowIssue")
@GetMapping("all_pulsar")
public AmisResponse<ImmutableList<Item>> allPulsar(@RequestParam(value = "key", required = false) String key) {
    // Look up Pulsar addresses via the info service — unfiltered when no key
    // is supplied — and wrap each address as an amis Item for the frontend.
    ImmutableList<String> addresses = StrUtil.isBlank(key)
        ? infoService.allPulsar()
        : infoService.allPulsar(key);
    return AmisResponse.responseSuccess(addresses.collect(Item::new));
}
@SuppressWarnings("DataFlowIssue")
@GetMapping("all_pulsar_topic")
public AmisResponse<ImmutableList<Item>> allPulsarTopic(@RequestParam(value = "key", required = false) String key) {
    // Look up Pulsar topics via the info service — unfiltered when no key is
    // supplied — and wrap each topic as an amis Item for the frontend.
    ImmutableList<String> topics = StrUtil.isBlank(key)
        ? infoService.allPulsarTopic()
        : infoService.allPulsarTopic(key);
    return AmisResponse.responseSuccess(topics.collect(Item::new));
}
}

View File

@@ -2,6 +2,7 @@ package com.lanyuanxiaoyao.service.web.controller;
import cn.hutool.core.util.StrUtil;
import com.lanyuanxiaoyao.service.configuration.ExecutorProvider;
import com.lanyuanxiaoyao.service.forest.service.PulsarService;
import com.lanyuanxiaoyao.service.forest.service.TaskService;
import com.lanyuanxiaoyao.service.web.controller.base.AmisMapResponse;
import com.lanyuanxiaoyao.service.web.controller.base.AmisResponse;
@@ -25,24 +26,35 @@ public class TaskController {
private static final Logger logger = LoggerFactory.getLogger(TaskController.class);
private final TaskService taskService;
private final PulsarService pulsarService;
public TaskController(TaskService taskService) {
public TaskController(TaskService taskService, PulsarService pulsarService) {
this.taskService = taskService;
this.pulsarService = pulsarService;
}
@GetMapping("scan")
public AmisResponse<Object> scan(
@RequestParam("hdfs") String hdfs,
@RequestParam("key") String key,
@RequestParam(value = "hdfs", required = false) String hdfs,
@RequestParam(value = "pulsar", required = false) String pulsar,
@RequestParam(value = "topic", required = false) String topic,
@RequestParam(value = "mode", defaultValue = "") String mode
) {
if (StrUtil.isBlank(hdfs) || StrUtil.isBlank(key)) {
throw new RuntimeException("Argument cannot be blank");
if (StrUtil.isBlank(key)) {
throw new RuntimeException("Key cannot be blank");
}
boolean scanQueue = StrUtil.contains(mode, "queue");
boolean scanLog = StrUtil.contains(mode, "log");
boolean scanBase = StrUtil.contains(mode, "base");
if (scanQueue && (StrUtil.isBlank(topic) || StrUtil.isBlank(pulsar))) {
throw new RuntimeException("Pulsar topic or url cannot be empty");
}
if ((scanLog || scanBase) && StrUtil.isBlank(hdfs)) {
throw new RuntimeException("Hdfs path cannot be empty");
}
ExecutorProvider.EXECUTORS.submit(() -> {
boolean scanLog = StrUtil.contains(mode, "log");
boolean scanData = StrUtil.contains(mode, "data");
String applicationId = taskService.scan(hdfs, key, scanLog, scanData);
String applicationId = taskService.scan(key, hdfs, pulsar, topic, scanQueue, scanLog, scanBase);
logger.info("Task: {}", applicationId);
});
return AmisResponse.responseSuccess();

View File

@@ -15,8 +15,10 @@ function taskTab() {
method: 'get',
url: '${base}/task/scan',
data: {
hdfs: '${hdfs|default:undefined}',
key: '${key|default:undefined}',
hdfs: '${hdfs|default:undefined}',
pulsar: '${pulsar|default:undefined}',
topic: '${topic|default:undefined}',
mode: '${scan_mode|default:undefined}',
}
}
@@ -31,31 +33,52 @@ function taskTab() {
required: true,
value: 'log',
options: [
{label: '消息队列', value: 'queue'},
{label: '日志文件', value: 'log'},
{label: '数据文件', value: 'data'},
{label: '数据文件', value: 'base'},
]
},
{
type: 'input-text',
name: 'key',
label: '检索字段',
required: true,
clearable: true,
description: '检索带有该字符的记录',
},
{
type: 'input-text',
name: 'hdfs',
label: 'HDFS路经',
requiredOn: '${CONTAINS(scan_mode, \'log\') || CONTAINS(scan_mode, \'base\')}',
visibleOn: '${CONTAINS(scan_mode, \'log\') || CONTAINS(scan_mode, \'base\')}',
clearable: true,
description: '输入表HDFS路径',
autoComplete: '${base}/table/all_hdfs?key=$term',
},
{
type: 'group',
body: [
{
type: 'input-text',
name: 'key',
label: '检索字段',
required: true,
name: 'topic',
label: 'Pulsar主题',
requiredOn: '${CONTAINS(scan_mode, \'queue\')}',
visibleOn: '${CONTAINS(scan_mode, \'queue\')}',
clearable: true,
description: '检索带有该字符的记录',
description: '输入Pulsar主题',
autoComplete: '${base}/table/all_pulsar_topic?key=$term',
columnRatio: 4,
},
{
type: 'input-text',
name: 'hdfs',
label: 'HDFS路经',
required: true,
name: 'pulsar',
label: 'Pulsar地址',
requiredOn: '${CONTAINS(scan_mode, \'queue\')}',
visibleOn: '${CONTAINS(scan_mode, \'queue\')}',
clearable: true,
description: '输入表HDFS路径',
autoComplete: '${base}/table/all_hdfs?key=$term',
columnRatio: 8,
description: '输入Pulsar地址',
autoComplete: '${base}/table/all_pulsar?key=$term',
},
]
}