diff --git a/service-ai/service-ai-knowledge/pom.xml b/service-ai/service-ai-knowledge/pom.xml
index 098c03a..92caba7 100644
--- a/service-ai/service-ai-knowledge/pom.xml
+++ b/service-ai/service-ai-knowledge/pom.xml
@@ -46,6 +46,14 @@
com.yomahub
liteflow-spring-boot-starter
+
+ org.springframework.ai
+ spring-ai-tika-document-reader
+
+
+ org.springframework.ai
+ spring-ai-pdf-document-reader
+
diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java
index 0cca981..688637a 100644
--- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java
+++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java
@@ -2,7 +2,7 @@ package com.lanyuanxiaoyao.service.ai.knowledge.controller;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.KnowledgeVO;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.PointVO;
-import com.lanyuanxiaoyao.service.ai.knowledge.reader.TextLineReader;
+import com.lanyuanxiaoyao.service.ai.knowledge.service.EmbeddingService;
import com.lanyuanxiaoyao.service.ai.knowledge.service.KnowledgeService;
import io.qdrant.client.QdrantClient;
import io.qdrant.client.grpc.Points;
@@ -14,7 +14,6 @@ import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.embedding.EmbeddingModel;
-import org.springframework.ai.reader.TextReader;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.ai.vectorstore.VectorStore;
@@ -37,11 +36,13 @@ public class KnowledgeController {
private static final Logger logger = LoggerFactory.getLogger(KnowledgeController.class);
private final KnowledgeService knowledgeService;
+ private final EmbeddingService embeddingService;
private final QdrantClient client;
private final EmbeddingModel embeddingModel;
- public KnowledgeController(KnowledgeService knowledgeService, VectorStore vectorStore, EmbeddingModel embeddingModel) {
+ /**
+ * Creates the controller and stores the injected collaborators.
+ * The vector store's native client is cast to {@link QdrantClient}; {@code orElseThrow()}
+ * makes construction fail when the store exposes no native client.
+ *
+ * @param knowledgeService stored in the {@code knowledgeService} field
+ * @param embeddingService stored in the {@code embeddingService} field
+ * @param vectorStore only used here to obtain the native Qdrant client
+ * @param embeddingModel stored in the {@code embeddingModel} field
+ */
+ public KnowledgeController(KnowledgeService knowledgeService, EmbeddingService embeddingService, VectorStore vectorStore, EmbeddingModel embeddingModel) {
this.knowledgeService = knowledgeService;
+ this.embeddingService = embeddingService;
client = (QdrantClient) vectorStore.getNativeClient().orElseThrow();
this.embeddingModel = embeddingModel;
}
@@ -89,15 +90,11 @@ public class KnowledgeController {
@PostMapping("preview_text")
public ImmutableList previewText(
- @RequestParam("name") String name,
- @RequestParam(value = "mode", defaultValue = "normal") String mode,
+ @RequestParam(value = "mode", defaultValue = "NORMAL") String mode,
@RequestParam(value = "type", defaultValue = "text") String type,
@RequestParam("content") String content
) {
- TextReader reader = new TextLineReader(new ByteArrayResource(content.getBytes(StandardCharsets.UTF_8)));
- return reader.get()
- .stream()
- .collect(Collectors.toCollection(Lists.mutable::empty))
+ return embeddingService.split(mode, content)
.collect(doc -> {
PointVO vo = new PointVO();
vo.setId(doc.getId());
diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java
new file mode 100644
index 0000000..5ecd72f
--- /dev/null
+++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java
@@ -0,0 +1,130 @@
+package com.lanyuanxiaoyao.service.ai.knowledge.entity;
+
+import cn.hutool.core.util.StrUtil;
+import java.io.File;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.eclipse.collections.api.factory.Lists;
+import org.eclipse.collections.api.factory.Maps;
+import org.springframework.ai.document.Document;
+
+/**
+ * @author lanyuanxiaoyao
+ * @version 20250523
+ */
+public class EmbeddingContext {
+    /** Options of this run; the constructors and setter replace {@code null} with a default {@link Config}. */
+    private Config config;
+    /** Text to split; trimmed when passed to a constructor, stored as-is by {@link #setContent(String)}. */
+    private String content;
+    /** Absolute path of the source file, or {@code null} when the context was built from text. */
+    private String file;
+    /** Segments collected for this context; starts as an empty mutable list. */
+    private List<Document> documents = Lists.mutable.empty();
+    /** Metadata associated with this context; starts as an empty mutable map. */
+    private Map<String, Object> metadata = Maps.mutable.empty();
+
+    /**
+     * Creates a text-based context with the default configuration.
+     *
+     * @param content text to split; leading/trailing whitespace is trimmed
+     */
+    public EmbeddingContext(String content) {
+        this(content, new Config());
+    }
+
+    /**
+     * Creates a text-based context.
+     *
+     * @param content text to split; leading/trailing whitespace is trimmed
+     * @param config  run options; {@code null} falls back to a default {@link Config}
+     */
+    public EmbeddingContext(String content, Config config) {
+        this.content = StrUtil.trim(content);
+        this.config = orDefault(config);
+    }
+
+    /**
+     * Creates a file-based context with the default configuration.
+     *
+     * @param file source file; its absolute path is stored
+     */
+    public EmbeddingContext(File file) {
+        this(file, new Config());
+    }
+
+    /**
+     * Creates a file-based context.
+     *
+     * @param file   source file; its absolute path is stored
+     * @param config run options; {@code null} falls back to a default {@link Config}
+     */
+    public EmbeddingContext(File file, Config config) {
+        this.file = file.getAbsolutePath();
+        this.config = orDefault(config);
+    }
+
+    /**
+     * Creates a file-based context from a path with the default configuration.
+     *
+     * @param path source file path
+     */
+    public EmbeddingContext(Path path) {
+        this(path.toFile());
+    }
+
+    /**
+     * Creates a file-based context from a path.
+     *
+     * @param path   source file path
+     * @param config run options; {@code null} falls back to a default {@link Config}
+     */
+    public EmbeddingContext(Path path, Config config) {
+        this(path.toFile(), config);
+    }
+
+    /** Replaces a missing configuration with the default one so later reads never see {@code null}. */
+    private static Config orDefault(Config config) {
+        return config == null ? new Config() : config;
+    }
+
+    public Config getConfig() {
+        return config;
+    }
+
+    /**
+     * @param config new run options; {@code null} falls back to a default {@link Config}
+     */
+    public void setConfig(Config config) {
+        this.config = orDefault(config);
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getFile() {
+        return file;
+    }
+
+    public void setFile(String file) {
+        this.file = file;
+    }
+
+    public List<Document> getDocuments() {
+        return documents;
+    }
+
+    public void setDocuments(List<Document> documents) {
+        this.documents = documents;
+    }
+
+    public Map<String, Object> getMetadata() {
+        return metadata;
+    }
+
+    public void setMetadata(Map<String, Object> metadata) {
+        this.metadata = metadata;
+    }
+
+    @Override
+    public String toString() {
+        return "EmbeddingContext{" +
+            "config=" + config +
+            ", content='" + content + '\'' +
+            ", file='" + file + '\'' +
+            ", documents=" + documents +
+            ", metadata=" + metadata +
+            '}';
+    }
+
+    /**
+     * Options that control how a context is processed.
+     */
+    public static final class Config {
+        private SplitStrategy splitStrategy = SplitStrategy.NORMAL;
+
+        /** Creates a configuration that uses {@link SplitStrategy#NORMAL}. */
+        public Config() {
+        }
+
+        /**
+         * @param splitStrategy strategy to use; {@code null} falls back to {@link SplitStrategy#NORMAL}
+         */
+        public Config(SplitStrategy splitStrategy) {
+            setSplitStrategy(splitStrategy);
+        }
+
+        public SplitStrategy getSplitStrategy() {
+            return splitStrategy;
+        }
+
+        /**
+         * @param splitStrategy strategy to use; {@code null} falls back to {@link SplitStrategy#NORMAL}
+         */
+        public void setSplitStrategy(SplitStrategy splitStrategy) {
+            // A null strategy would only fail later when a switch is evaluated on it.
+            this.splitStrategy = splitStrategy == null ? SplitStrategy.NORMAL : splitStrategy;
+        }
+
+        @Override
+        public String toString() {
+            return "Config{" +
+                "splitStrategy=" + splitStrategy +
+                '}';
+        }
+
+        /** Available ways of splitting content into segments. */
+        public enum SplitStrategy {
+            NORMAL, LLM, QA
+        }
+    }
+}
diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java
deleted file mode 100644
index ceb1c28..0000000
--- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package com.lanyuanxiaoyao.service.ai.knowledge.reader;
-
-import cn.hutool.core.util.StrUtil;
-import java.util.List;
-import java.util.stream.Stream;
-import org.springframework.ai.document.Document;
-import org.springframework.ai.reader.TextReader;
-import org.springframework.core.io.Resource;
-
-/**
- * @author lanyuanxiaoyao
- * @version 20250522
- */
-public class TextLineReader extends TextReader {
- public TextLineReader(Resource resource) {
- super(resource);
- }
-
- @Override
- public List get() {
- return super.get()
- .stream()
- .flatMap(doc -> {
- String text = doc.getText();
- if (StrUtil.isBlank(text)) {
- return Stream.of(doc);
- }
- return Stream.of(text.split("\n\n"))
- .filter(StrUtil::isNotBlank)
- .map(line -> new Document(line, doc.getMetadata()));
- })
- .toList();
- }
-}
diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java
index f85466f..bb3f901 100644
--- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java
+++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java
@@ -1,7 +1,13 @@
package com.lanyuanxiaoyao.service.ai.knowledge.service;
+import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext;
+import com.yomahub.liteflow.core.FlowExecutor;
+import java.nio.file.Path;
+import org.eclipse.collections.api.factory.Lists;
+import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.springframework.ai.document.Document;
import org.springframework.stereotype.Service;
/**
@@ -12,5 +18,19 @@ import org.springframework.stereotype.Service;
public class EmbeddingService {
private static final Logger logger = LoggerFactory.getLogger(EmbeddingService.class);
+    private final FlowExecutor executor;
+
+    /**
+     * @param executor LiteFlow executor used to run the {@code embedding} chain
+     */
+    @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
+    public EmbeddingService(FlowExecutor executor) {
+        this.executor = executor;
+    }
+
+    /**
+     * Splits text into documents by running the {@code embedding} chain.
+     *
+     * @param mode    name of a {@link EmbeddingContext.Config.SplitStrategy}; matched
+     *                case-insensitively, blank or {@code null} means {@code NORMAL}
+     * @param content text to split
+     * @return the documents collected in the context after the chain has run
+     * @throws IllegalArgumentException if {@code mode} names no known strategy
+     * @throws IllegalStateException    if the chain reports a failure
+     */
+    public ImmutableList<Document> split(String mode, String content) {
+        EmbeddingContext context = new EmbeddingContext(
+            content,
+            new EmbeddingContext.Config(parseStrategy(mode))
+        );
+        var response = executor.execute2Resp("embedding", null, context);
+        // execute2Resp captures node exceptions in the response instead of throwing them;
+        // without this check a failed chain would look like "no segments found".
+        if (!response.isSuccess()) {
+            throw new IllegalStateException("Embedding flow failed: " + response.getMessage(), response.getCause());
+        }
+        return Lists.immutable.ofAll(context.getDocuments());
+    }
+
+    /** Maps a request value such as {@code "normal"} or {@code "QA"} to its strategy constant. */
+    private static EmbeddingContext.Config.SplitStrategy parseStrategy(String mode) {
+        if (mode == null || mode.isBlank()) {
+            return EmbeddingContext.Config.SplitStrategy.NORMAL;
+        }
+        try {
+            return EmbeddingContext.Config.SplitStrategy.valueOf(mode.trim().toUpperCase(java.util.Locale.ROOT));
+        } catch (IllegalArgumentException e) {
+            throw new IllegalArgumentException("Unsupported split mode: " + mode, e);
+        }
+    }
}
diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java
new file mode 100644
index 0000000..2f57167
--- /dev/null
+++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java
@@ -0,0 +1,201 @@
+package com.lanyuanxiaoyao.service.ai.knowledge.service.node;
+
+import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.lang.Assert;
+import cn.hutool.core.util.StrUtil;
+import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext;
+import com.yomahub.liteflow.annotation.LiteflowComponent;
+import com.yomahub.liteflow.annotation.LiteflowMethod;
+import com.yomahub.liteflow.core.NodeComponent;
+import com.yomahub.liteflow.enums.LiteFlowMethodEnum;
+import com.yomahub.liteflow.enums.NodeTypeEnum;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.ai.reader.ExtractedTextFormatter;
+import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
+import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
+import org.springframework.ai.reader.tika.TikaDocumentReader;
+import org.springframework.ai.transformer.splitter.TokenTextSplitter;
+import org.springframework.core.io.PathResource;
+
+/**
+ * @author lanyuanxiaoyao
+ * @version 20250523
+ */
+@LiteflowComponent
+public class EmbeddingNodes {
+ private static final Logger logger = LoggerFactory.getLogger(EmbeddingNodes.class);
+
+ private final ChatClient chatClient;
+
+ /**
+ * Builds the {@link ChatClient} stored in {@code chatClient} from the injected builder.
+ *
+ * @param builder chat client builder; {@code build()} is called once here
+ */
+ public EmbeddingNodes(ChatClient.Builder builder) {
+ this.chatClient = builder.build();
+ }
+
+    /**
+     * Boolean node: decides whether the chain has to read a file first.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     * @return {@code true} when the context names an existing file, {@code false} when it
+     * carries non-blank text instead; the assertions fail for a missing context, a
+     * missing file, or blank content
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_BOOLEAN, nodeId = "embedding_check_if_file_needed", nodeName = "判断是否需要读取文件", nodeType = NodeTypeEnum.BOOLEAN)
+    public boolean checkIfFileReadNeeded(NodeComponent node) {
+        EmbeddingContext ctx = node.getContextBean(EmbeddingContext.class);
+        Assert.notNull(ctx, "EmbeddingContext is null");
+        String path = ctx.getFile();
+        if (StrUtil.isBlank(path)) {
+            // Text input: there must be something to split.
+            Assert.notBlank(ctx.getContent(), "Contents is empty");
+            return false;
+        }
+        Assert.isTrue(FileUtil.exist(path), "File [{}] not exist", path);
+        return true;
+    }
+
+    /**
+     * Node {@code test_print}: logs the current content of the context at info level.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "test_print", nodeType = NodeTypeEnum.COMMON)
+    public void testPrint(NodeComponent node) {
+        String content = node.getContextBean(EmbeddingContext.class).getContent();
+        logger.info(content);
+    }
+
+    /**
+     * Switch node: picks the reader node id from the file extension (case-insensitive).
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     * @return {@code txt_file_reader}, {@code pdf_file_reader} or {@code any_file_reader}
+     * @throws IllegalStateException if the extension is not one of the listed ones
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "file_reader_switch", nodeName = "判断文件格式", nodeType = NodeTypeEnum.SWITCH)
+    public String fileReaderSwitch(NodeComponent node) {
+        String path = node.getContextBean(EmbeddingContext.class).getFile();
+        String extName = FileUtil.extName(path);
+        String ext = extName.toLowerCase();
+        if (List.of("txt", "md", "markdown").contains(ext)) {
+            return "txt_file_reader";
+        }
+        if ("pdf".equals(ext)) {
+            return "pdf_file_reader";
+        }
+        if (List.of("doc", "docx", "xls", "xlsx", "ppt", "pptx", "html", "xml", "wps", "et", "dpt").contains(ext)) {
+            return "any_file_reader";
+        }
+        throw new IllegalStateException("Unsupported ext: " + extName);
+    }
+
+    /**
+     * Node {@code txt_file_reader}: loads a plain-text file into the context content.
+     * The file is decoded as UTF-8 rather than the platform default charset, so the result
+     * does not depend on the host configuration, and it is trimmed like the output of the
+     * other readers in this class.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "txt_file_reader", nodeName = "读取文本文件", nodeType = NodeTypeEnum.COMMON)
+    public void txtFileReader(NodeComponent node) {
+        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+        context.setContent(StrUtil.trim(FileUtil.readUtf8String(context.getFile())));
+    }
+
+    /**
+     * Node {@code pdf_file_reader}: reads a PDF with {@link PagePdfDocumentReader} and stores
+     * the joined text as the context content. Top margin and deleted top lines are both 0.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "pdf_file_reader", nodeName = "读取pdf文件", nodeType = NodeTypeEnum.COMMON)
+    public void pdfFileReader(NodeComponent node) {
+        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+        PathResource resource = new PathResource(context.getFile());
+        ExtractedTextFormatter formatter = ExtractedTextFormatter.builder()
+            .withNumberOfTopTextLinesToDelete(0)
+            .build();
+        PdfDocumentReaderConfig readerConfig = PdfDocumentReaderConfig.builder()
+            .withPageTopMargin(0)
+            .withPageExtractedTextFormatter(formatter)
+            .build();
+        String text = readBySpringAiReader(new PagePdfDocumentReader(resource, readerConfig));
+        context.setContent(text);
+    }
+
+    /**
+     * Node {@code any_file_reader}: reads formats supported by Tika through
+     * {@link TikaDocumentReader} and stores the joined text as the context content.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "any_file_reader", nodeName = "使用Tika尝试读取文件", nodeType = NodeTypeEnum.COMMON)
+    public void anyFileReader(NodeComponent node) {
+        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+        PathResource resource = new PathResource(context.getFile());
+        TikaDocumentReader reader = new TikaDocumentReader(resource);
+        String text = readBySpringAiReader(reader);
+        context.setContent(text);
+    }
+
+    /**
+     * Reads every document from the reader and joins their texts with a line break.
+     *
+     * @param reader reader to drain
+     * @return the joined text with leading and trailing whitespace removed
+     */
+    private String readBySpringAiReader(DocumentReader reader) {
+        StringBuilder joined = new StringBuilder();
+        boolean first = true;
+        for (Document part : reader.get()) {
+            if (!first) {
+                joined.append("\n");
+            }
+            joined.append(part.getText());
+            first = false;
+        }
+        return joined.toString().trim();
+    }
+
+    /**
+     * Switch node: maps the configured split strategy to the id of the node that implements it.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     * @return {@code normal_split}, {@code llm_split} or {@code qa_split}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "split_switch", nodeName = "判断使用什么分段方法", nodeType = NodeTypeEnum.SWITCH)
+    public String splitSwitch(NodeComponent node) {
+        EmbeddingContext.Config config = node.getContextBean(EmbeddingContext.class).getConfig();
+        String targetNodeId = switch (config.getSplitStrategy()) {
+            case NORMAL -> "normal_split";
+            case LLM -> "llm_split";
+            case QA -> "qa_split";
+        };
+        return targetNodeId;
+    }
+
+    /**
+     * Node {@code normal_split}: splits the context content with a {@link TokenTextSplitter}.
+     * The context metadata is attached to the source document, matching what the
+     * {@code llm_split} and {@code qa_split} nodes do for their segments.
+     *
+     * @param node current node, used to obtain the {@link EmbeddingContext}
+     */
+    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "normal_split", nodeName = "使用普通分段", nodeType = NodeTypeEnum.COMMON)
+    public void normalSplit(NodeComponent node) {
+        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+        final int chunkSize = 200;
+        final int minChunkSizeChars = 100;
+        final int minChunkLengthToEmbed = 5;
+        // NOTE(review): the splitter stops after this many chunks, so very long inputs
+        // appear to be cut off rather than fully split — confirm this cap is intended.
+        final int maxNumChunks = 200;
+        final boolean keepSeparator = true;
+        TokenTextSplitter splitter = new TokenTextSplitter(chunkSize, minChunkSizeChars, minChunkLengthToEmbed, maxNumChunks, keepSeparator);
+        Document document = Document.builder()
+            .text(context.getContent())
+            .metadata(context.getMetadata())
+            .build();
+        context.setDocuments(splitter.split(document));
+    }
+
+ /**
+ * Node {@code llm_split}: asks the chat model to segment the context content and appends
+ * the resulting documents to the context's document list.
+ * The system prompt (in Chinese) tells the model to split by meaning without cutting
+ * sentences, to separate segments with blank lines, to avoid Markdown, and to keep the
+ * wording of the original text unchanged.
+ *
+ * @param node current node, used to obtain the {@link EmbeddingContext}
+ */
+ @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "llm_split", nodeName = "使用大模型分段", nodeType = NodeTypeEnum.COMMON)
+ public void llmSplit(NodeComponent node) {
+ EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+ context.getDocuments().addAll(llmSplit(
+ """
+ 对用户输入的文本,生成高质量的分段。请遵循以下指南:
+ 1. 分段原则:
+ 分段按文本内容的语义进行分割,每个分段都尽可能保持完整连续的内容表达。
+ 避免从词句的中间进行分割。
+ 2. 格式:
+ 分段之间用两个空行分隔,以提高可读性。
+ 避免使用任何Markdown格式
+ 3. 内容要求:
+ 确保每个分段的内容文字完全依照原文。
+ 避免添加任何原文中不存在的文字。
+ """,
+ context.getContent(),
+ context.getMetadata()
+ ));
+ }
+
+ /**
+ * Node {@code qa_split}: asks the chat model to turn the context content into
+ * question/answer pairs and appends the resulting documents to the context's document list.
+ * The system prompt (in Chinese) asks for several phrasings of each question after a
+ * {@code Q:} marker, an answer after an {@code A:} marker, blank lines between pairs,
+ * no Markdown, and answers that stay within the given text.
+ *
+ * @param node current node, used to obtain the {@link EmbeddingContext}
+ */
+ @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "qa_split", nodeName = "使用Q/A格式分段", nodeType = NodeTypeEnum.COMMON)
+ public void qaSplit(NodeComponent node) {
+ EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
+ context.getDocuments().addAll(llmSplit(
+ """
+ 对用户输入的文本,生成一组高质量的问答对。请遵循以下指南:
+ 1. 问题部分:
+ 为同一个主题创建尽可能多的不同表述的问题,确保问题的多样性。
+ 每个问题应考虑用户可能的多种问法,例如:
+ 直接询问(如“什么是...?”)
+ 请求确认(如“是否可以说...?”)
+ 寻求解释(如“请解释一下...的含义。”)
+ 假设性问题(如“如果...会怎样?”)
+ 例子请求(如“能否举个例子说明...?”)
+ 问题应涵盖文本中的关键信息、主要概念和细节,确保不遗漏重要内容。
+ 2. 答案部分:
+ 提供一个全面、信息丰富的答案,涵盖问题的所有可能角度,确保逻辑连贯。
+ 答案应直接基于给定文本,确保准确性和一致性。
+ 包含相关的细节,如日期、名称、职位等具体信息,必要时提供背景信息以增强理解。
+ 3. 格式:
+ 使用"Q:"标记问题集合的开始,所有问题应在一个段落内,问题之间用空格分隔。
+ 使用"A:"标记答案的开始,答案应清晰分段,便于阅读。
+ 问答对之间用两个空行分隔,以提高可读性。
+ 避免使用任何Markdown格式
+ 4. 内容要求:
+ 确保问答对紧密围绕文本主题,避免偏离主题。
+ 避免添加文本中未提及的信息,确保信息的真实性。
+ 如果文本信息不足以回答某个方面,可以在答案中说明 "根据给定信息无法确定",并尽量提供相关的上下文。
+ """,
+ context.getContent(),
+ context.getMetadata()
+ ));
+ }
+
+    /**
+     * Sends the content to the chat model and turns the reply into one document per segment.
+     * Segments are separated by blank lines (LF or CRLF, optionally containing whitespace);
+     * each segment is trimmed and blank segments are dropped.
+     *
+     * @param prompt   system prompt describing how the model should segment the text
+     * @param content  user text to segment
+     * @param metadata metadata attached to every produced document
+     * @return the produced documents in reply order; the assertion fails on a blank reply
+     */
+    private List<Document> llmSplit(String prompt, String content, Map<String, Object> metadata) {
+        String response = chatClient.prompt()
+            .system(prompt)
+            .user(content)
+            .call()
+            .content();
+        Assert.notBlank(response, "LLM response is empty");
+        // The previous pattern "(s?)\\s*\\n\\n" matched an optional literal 's', which removed
+        // a trailing "s" from segments; split on the blank line only and trim afterwards.
+        // noinspection DataFlowIssue
+        return Arrays.stream(StrUtil.trim(response).split("\\n\\s*\\n"))
+            .map(StrUtil::trim)
+            .filter(StrUtil::isNotBlank)
+            .map(text -> Document.builder()
+                .text(text)
+                .metadata(metadata)
+                .build())
+            .toList();
+    }
+}
diff --git a/service-ai/service-ai-knowledge/src/main/resources/application.yml b/service-ai/service-ai-knowledge/src/main/resources/application.yml
index 52b26d9..30e248d 100644
--- a/service-ai/service-ai-knowledge/src/main/resources/application.yml
+++ b/service-ai/service-ai-knowledge/src/main/resources/application.yml
@@ -33,7 +33,7 @@ spring:
api-key: ENC(K+Hff9QGC+fcyi510VIDd9CaeK/IN5WBJ9rlkUsHEdDgIidW+stHHJlsK0lLPUXXREha+ToQZqqDXJrqSE+GUKCXklFhelD8bRHFXBIeP/ZzT2cxhzgKUXgjw3S0Qw2R)
chat:
options:
- model: 'Qwen3-1.7'
+ model: 'Qwen3-1.7-vllm'
embedding:
options:
model: 'Bge-m3'
diff --git a/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml b/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml
index 9b93803..08f7b43 100644
--- a/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml
+++ b/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml
@@ -1,14 +1,20 @@
+
-
+
SER(
- embedding_start,
- SWITCH(embedding_mode_switch).TO(
- normal_embedding,
- llm_embedding,
- qa_embedding
+ IF(
+ embedding_check_if_file_needed,
+ SWITCH(file_reader_switch).TO(
+ txt_file_reader,
+ pdf_file_reader
+ ).DEFAULT(any_file_reader)
),
- embedding_finish
- );
+ SWITCH(split_switch).TO(
+ normal_split,
+ llm_split,
+ qa_split
+ )
+ )
\ No newline at end of file
diff --git a/service-web/client/src/pages/ai/knowledge/DataImport.tsx b/service-web/client/src/pages/ai/knowledge/DataImport.tsx
index 5df3008..3d32816 100644
--- a/service-web/client/src/pages/ai/knowledge/DataImport.tsx
+++ b/service-web/client/src/pages/ai/knowledge/DataImport.tsx
@@ -33,21 +33,19 @@ const DataImport: React.FC = () => {
name: 'mode',
type: 'radios',
label: '解析模式',
- value: 'normal',
+ value: 'NORMAL',
options: [
{
- value: 'normal',
+ value: 'NORMAL',
label: '常规模式',
},
{
- value: 'llm',
+ value: 'LLM',
label: '智能模式',
- disabled: true,
},
{
- value: 'qa',
+ value: 'QA',
label: 'Q/A模式',
- disabled: true,
},
],
},
@@ -105,7 +103,6 @@ const DataImport: React.FC = () => {
},
dataType: 'form',
data: {
- name: name,
mode: '${mode}',
type: '${type}',
content: '${content}',