diff --git a/service-ai/service-ai-knowledge/pom.xml b/service-ai/service-ai-knowledge/pom.xml index 098c03a..92caba7 100644 --- a/service-ai/service-ai-knowledge/pom.xml +++ b/service-ai/service-ai-knowledge/pom.xml @@ -46,6 +46,14 @@ com.yomahub liteflow-spring-boot-starter + + org.springframework.ai + spring-ai-tika-document-reader + + + org.springframework.ai + spring-ai-pdf-document-reader + diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java index 0cca981..688637a 100644 --- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java +++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/controller/KnowledgeController.java @@ -2,7 +2,7 @@ package com.lanyuanxiaoyao.service.ai.knowledge.controller; import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.KnowledgeVO; import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.PointVO; -import com.lanyuanxiaoyao.service.ai.knowledge.reader.TextLineReader; +import com.lanyuanxiaoyao.service.ai.knowledge.service.EmbeddingService; import com.lanyuanxiaoyao.service.ai.knowledge.service.KnowledgeService; import io.qdrant.client.QdrantClient; import io.qdrant.client.grpc.Points; @@ -14,7 +14,6 @@ import org.eclipse.collections.api.list.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.embedding.EmbeddingModel; -import org.springframework.ai.reader.TextReader; import org.springframework.ai.reader.markdown.MarkdownDocumentReader; import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; import org.springframework.ai.vectorstore.VectorStore; @@ -37,11 +36,13 @@ public class KnowledgeController { private static final Logger logger = 
LoggerFactory.getLogger(KnowledgeController.class); private final KnowledgeService knowledgeService; + private final EmbeddingService embeddingService; private final QdrantClient client; private final EmbeddingModel embeddingModel; - public KnowledgeController(KnowledgeService knowledgeService, VectorStore vectorStore, EmbeddingModel embeddingModel) { + public KnowledgeController(KnowledgeService knowledgeService, EmbeddingService embeddingService, VectorStore vectorStore, EmbeddingModel embeddingModel) { this.knowledgeService = knowledgeService; + this.embeddingService = embeddingService; client = (QdrantClient) vectorStore.getNativeClient().orElseThrow(); this.embeddingModel = embeddingModel; } @@ -89,15 +90,11 @@ public class KnowledgeController { @PostMapping("preview_text") public ImmutableList previewText( - @RequestParam("name") String name, - @RequestParam(value = "mode", defaultValue = "normal") String mode, + @RequestParam(value = "mode", defaultValue = "NORMAL") String mode, @RequestParam(value = "type", defaultValue = "text") String type, @RequestParam("content") String content ) { - TextReader reader = new TextLineReader(new ByteArrayResource(content.getBytes(StandardCharsets.UTF_8))); - return reader.get() - .stream() - .collect(Collectors.toCollection(Lists.mutable::empty)) + return embeddingService.split(mode, content) .collect(doc -> { PointVO vo = new PointVO(); vo.setId(doc.getId()); diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java new file mode 100644 index 0000000..5ecd72f --- /dev/null +++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/entity/EmbeddingContext.java @@ -0,0 +1,130 @@ +package com.lanyuanxiaoyao.service.ai.knowledge.entity; + +import cn.hutool.core.util.StrUtil; +import java.io.File; +import 
java.nio.file.Path; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.eclipse.collections.api.factory.Lists; +import org.eclipse.collections.api.factory.Maps; +import org.springframework.ai.document.Document; + +/** + * @author lanyuanxiaoyao + * @version 20250523 + */ +public class EmbeddingContext { + private Config config; + private String content; + private String file; + private List documents = Lists.mutable.empty(); + private Map metadata = Maps.mutable.empty(); + + public EmbeddingContext(String content) { + this(content, new Config()); + } + + public EmbeddingContext(String content, Config config) { + this.content = StrUtil.trim(content); + this.config = config; + } + + public EmbeddingContext(File file) { + this(file, new Config()); + } + + public EmbeddingContext(File file, Config config) { + this.file = file.getAbsolutePath(); + this.config = config; + } + + public EmbeddingContext(Path path) { + this(path.toFile()); + } + + public EmbeddingContext(Path path, Config config) { + this(path.toFile(), config); + } + + public Config getConfig() { + return config; + } + + public void setConfig(Config config) { + this.config = config; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getFile() { + return file; + } + + public void setFile(String file) { + this.file = file; + } + + public List getDocuments() { + return documents; + } + + public void setDocuments(List documents) { + this.documents = documents; + } + + public Map getMetadata() { + return metadata; + } + + public void setMetadata(Map metadata) { + this.metadata = metadata; + } + + @Override + public String toString() { + return "EmbeddingContext{" + + "config=" + config + + ", content='" + content + '\'' + + ", file='" + file + '\'' + + ", documents=" + documents + + ", metadata=" + metadata + + '}'; + } + + public static final class Config { + private 
SplitStrategy splitStrategy = SplitStrategy.NORMAL; + + public Config() { + } + + public Config(SplitStrategy splitStrategy) { + this.splitStrategy = splitStrategy; + } + + public SplitStrategy getSplitStrategy() { + return splitStrategy; + } + + public void setSplitStrategy(SplitStrategy splitStrategy) { + this.splitStrategy = splitStrategy; + } + + @Override + public String toString() { + return "Config{" + + "splitStrategy=" + splitStrategy + + '}'; + } + + public enum SplitStrategy { + NORMAL, LLM, QA + } + } +} diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java deleted file mode 100644 index ceb1c28..0000000 --- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/reader/TextLineReader.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.lanyuanxiaoyao.service.ai.knowledge.reader; - -import cn.hutool.core.util.StrUtil; -import java.util.List; -import java.util.stream.Stream; -import org.springframework.ai.document.Document; -import org.springframework.ai.reader.TextReader; -import org.springframework.core.io.Resource; - -/** - * @author lanyuanxiaoyao - * @version 20250522 - */ -public class TextLineReader extends TextReader { - public TextLineReader(Resource resource) { - super(resource); - } - - @Override - public List get() { - return super.get() - .stream() - .flatMap(doc -> { - String text = doc.getText(); - if (StrUtil.isBlank(text)) { - return Stream.of(doc); - } - return Stream.of(text.split("\n\n")) - .filter(StrUtil::isNotBlank) - .map(line -> new Document(line, doc.getMetadata())); - }) - .toList(); - } -} diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java 
b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java index f85466f..bb3f901 100644 --- a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java +++ b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/EmbeddingService.java @@ -1,7 +1,15 @@ package com.lanyuanxiaoyao.service.ai.knowledge.service; +import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext; +import com.yomahub.liteflow.core.FlowExecutor; +import com.yomahub.liteflow.flow.LiteflowResponse; +import java.nio.file.Path; +import java.util.Locale; +import org.eclipse.collections.api.factory.Lists; +import org.eclipse.collections.api.list.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.ai.document.Document; import org.springframework.stereotype.Service; /** @@ -12,5 +20,33 @@ import org.springframework.stereotype.Service; public class EmbeddingService { private static final Logger logger = LoggerFactory.getLogger(EmbeddingService.class); + private final FlowExecutor executor; + @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection") + public EmbeddingService(FlowExecutor executor) { + this.executor = executor; + } + + /** + * Runs the "embedding" flow over the given text and returns the resulting segments. + * + * @param mode name of a {@link EmbeddingContext.Config.SplitStrategy} constant, case-insensitive + * @param content raw text to split + * @return the documents the flow stored in the context + * @throws IllegalArgumentException if {@code mode} names no split strategy + * @throws IllegalStateException if the flow reports a failure + */ + public ImmutableList split(String mode, String content) { + EmbeddingContext context = new EmbeddingContext( + content, + /* Upper-case first so the lower-case values the API accepted before ("normal", "llm", "qa") keep working. */ + new EmbeddingContext.Config(EmbeddingContext.Config.SplitStrategy.valueOf(mode.trim().toUpperCase(Locale.ROOT))) + ); + LiteflowResponse response = executor.execute2Resp("embedding", null, context); + /* execute2Resp reports node failures through the response instead of throwing; surface them rather than returning an empty list. */ + if (!response.isSuccess()) { + throw new IllegalStateException("Embedding flow failed: " + response.getMessage(), response.getCause()); + } + return Lists.immutable.ofAll(context.getDocuments()); + } } diff --git a/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java new file mode 100644 index 0000000..2f57167 --- /dev/null +++ 
b/service-ai/service-ai-knowledge/src/main/java/com/lanyuanxiaoyao/service/ai/knowledge/service/node/EmbeddingNodes.java @@ -0,0 +1,201 @@ +package com.lanyuanxiaoyao.service.ai.knowledge.service.node; + +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.lang.Assert; +import cn.hutool.core.util.StrUtil; +import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext; +import com.yomahub.liteflow.annotation.LiteflowComponent; +import com.yomahub.liteflow.annotation.LiteflowMethod; +import com.yomahub.liteflow.core.NodeComponent; +import com.yomahub.liteflow.enums.LiteFlowMethodEnum; +import com.yomahub.liteflow.enums.NodeTypeEnum; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.ai.chat.client.ChatClient; +import org.springframework.ai.document.Document; +import org.springframework.ai.document.DocumentReader; +import org.springframework.ai.reader.ExtractedTextFormatter; +import org.springframework.ai.reader.pdf.PagePdfDocumentReader; +import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; +import org.springframework.ai.reader.tika.TikaDocumentReader; +import org.springframework.ai.transformer.splitter.TokenTextSplitter; +import org.springframework.core.io.PathResource; + +/** + * @author lanyuanxiaoyao + * @version 20250523 + */ +@LiteflowComponent +public class EmbeddingNodes { + private static final Logger logger = LoggerFactory.getLogger(EmbeddingNodes.class); + + private final ChatClient chatClient; + + public EmbeddingNodes(ChatClient.Builder builder) { + this.chatClient = builder.build(); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_BOOLEAN, nodeId = "embedding_check_if_file_needed", nodeName = "判断是否需要读取文件", nodeType = NodeTypeEnum.BOOLEAN) + public boolean checkIfFileReadNeeded(NodeComponent node) { + EmbeddingContext context = 
node.getContextBean(EmbeddingContext.class); + Assert.notNull(context, "EmbeddingContext is null"); + if (StrUtil.isNotBlank(context.getFile())) { + Assert.isTrue(FileUtil.exist(context.getFile()), "File [{}] not exist", context.getFile()); + return true; + } + Assert.notBlank(context.getContent(), "Contents is empty"); + return false; + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "test_print", nodeType = NodeTypeEnum.COMMON) + public void testPrint(NodeComponent node) { + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + logger.info(context.getContent()); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "file_reader_switch", nodeName = "判断文件格式", nodeType = NodeTypeEnum.SWITCH) + public String fileReaderSwitch(NodeComponent node) { /* Picks the reader node id from the lower-cased file extension; an unlisted extension raises IllegalStateException. */ + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + String extName = FileUtil.extName(context.getFile()); + return switch (extName.toLowerCase()) { + case "txt", "md", "markdown" -> "txt_file_reader"; + case "pdf" -> "pdf_file_reader"; + case "doc", "docx", "xls", "xlsx", "ppt", "pptx", "html", "xml", "wps", "et", "dpt" -> "any_file_reader"; + default -> throw new IllegalStateException("Unsupported ext: " + extName); + }; + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "txt_file_reader", nodeName = "读取文本文件", nodeType = NodeTypeEnum.COMMON) + public void txtFileReader(NodeComponent node) { /* Loads the text file named by the context into its content, always decoding as UTF-8 so the result does not depend on the platform default charset. */ + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + context.setContent(FileUtil.readUtf8String(context.getFile())); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "pdf_file_reader", nodeName = "读取pdf文件", nodeType = NodeTypeEnum.COMMON) + public void pdfFileReader(NodeComponent node) { + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + PagePdfDocumentReader reader = new PagePdfDocumentReader( + new PathResource(context.getFile()), + 
PdfDocumentReaderConfig.builder() + .withPageTopMargin(0) + .withPageExtractedTextFormatter(ExtractedTextFormatter.builder() + .withNumberOfTopTextLinesToDelete(0) + .build()) + .build()); + context.setContent(readBySpringAiReader(reader)); + } + + /** + * Formats supported by Tika: reads the context file through TikaDocumentReader and stores the extracted text as the context content. + */ + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "any_file_reader", nodeName = "使用Tika尝试读取文件", nodeType = NodeTypeEnum.COMMON) + public void anyFileReader(NodeComponent node) { + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + context.setContent(readBySpringAiReader(new TikaDocumentReader(new PathResource(context.getFile())))); + } + + private String readBySpringAiReader(DocumentReader reader) { /* Joins the text of every document the reader yields with a newline and trims the result. */ + return reader.get() + .stream() + .map(Document::getText) + .collect(Collectors.joining("\n")) + .trim(); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "split_switch", nodeName = "判断使用什么分段方法", nodeType = NodeTypeEnum.SWITCH) + public String splitSwitch(NodeComponent node) { /* Maps the configured split strategy to the id of the node that implements it. */ + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + return switch (context.getConfig().getSplitStrategy()) { + case NORMAL -> "normal_split"; + case LLM -> "llm_split"; + case QA -> "qa_split"; + }; + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "normal_split", nodeName = "使用普通分段", nodeType = NodeTypeEnum.COMMON) + public void normalSplit(NodeComponent node) { /* Wraps the context content in a single Document, splits it with TokenTextSplitter and replaces the context documents with the result. */ + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + TokenTextSplitter splitter = new TokenTextSplitter(200, 100, 5, 200, true); /* NOTE(review): positional tuning values; confirm each one against the TokenTextSplitter constructor. */ + Document document = Document.builder() + .text(context.getContent()) + .build(); /* NOTE(review): unlike llm_split and qa_split, the context metadata is not attached here; confirm this is intended. */ + context.setDocuments(splitter.split(document)); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "llm_split", nodeName = "使用大模型分段", nodeType = NodeTypeEnum.COMMON) + public void llmSplit(NodeComponent node) { + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + 
context.getDocuments().addAll(llmSplit( + """ + 对用户输入的文本,生成高质量的分段。请遵循以下指南: + 1. 分段原则: + 分段按文本内容的语义进行分割,每个分段都尽可能保持完整连续的内容表达。 + 避免从词句的中间进行分割。 + 2. 格式: + 分段之间用两个空行分隔,以提高可读性。 + 避免使用任何Markdown格式 + 3. 内容要求: + 确保每个分段的内容文字完全依照原文。 + 避免添加任何原文中不存在的文字。 + """, + context.getContent(), + context.getMetadata() + )); + } + + @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "qa_split", nodeName = "使用Q/A格式分段", nodeType = NodeTypeEnum.COMMON) + public void qaSplit(NodeComponent node) { + EmbeddingContext context = node.getContextBean(EmbeddingContext.class); + context.getDocuments().addAll(llmSplit( + """ + 对用户输入的文本,生成一组高质量的问答对。请遵循以下指南: + 1. 问题部分: + 为同一个主题创建尽可能多的不同表述的问题,确保问题的多样性。 + 每个问题应考虑用户可能的多种问法,例如: + 直接询问(如“什么是...?”) + 请求确认(如“是否可以说...?”) + 寻求解释(如“请解释一下...的含义。”) + 假设性问题(如“如果...会怎样?”) + 例子请求(如“能否举个例子说明...?”) + 问题应涵盖文本中的关键信息、主要概念和细节,确保不遗漏重要内容。 + 2. 答案部分: + 提供一个全面、信息丰富的答案,涵盖问题的所有可能角度,确保逻辑连贯。 + 答案应直接基于给定文本,确保准确性和一致性。 + 包含相关的细节,如日期、名称、职位等具体信息,必要时提供背景信息以增强理解。 + 3. 格式: + 使用"Q:"标记问题集合的开始,所有问题应在一个段落内,问题之间用空格分隔。 + 使用"A:"标记答案的开始,答案应清晰分段,便于阅读。 + 问答对之间用两个空行分隔,以提高可读性。 + 避免使用任何Markdown格式 + 4. 
内容要求: + 确保问答对紧密围绕文本主题,避免偏离主题。 + 避免添加文本中未提及的信息,确保信息的真实性。 + 如果文本信息不足以回答某个方面,可以在答案中说明 "根据给定信息无法确定",并尽量提供相关的上下文。 + """, + context.getContent(), + context.getMetadata() + )); + } + + private List llmSplit(String prompt, String content, Map metadata) { /* Sends the content to the chat model under the given system prompt and turns each blank-line separated segment of the reply into a Document carrying the supplied metadata. */ + String response = chatClient.prompt() + .system(prompt) + .user(content) + .call() + .content(); + Assert.notBlank(response, "LLM response is empty"); + // noinspection DataFlowIssue + return Arrays.stream(StrUtil.trim(response).split("\\n\\s*\\n")) /* split on blank lines only; the former "(s?)" group also swallowed a trailing letter "s" of the preceding segment */ + .map(StrUtil::trim) + .map(text -> Document.builder() + .text(text) + .metadata(metadata) + .build()) + .toList(); + } +} diff --git a/service-ai/service-ai-knowledge/src/main/resources/application.yml b/service-ai/service-ai-knowledge/src/main/resources/application.yml index 52b26d9..30e248d 100644 --- a/service-ai/service-ai-knowledge/src/main/resources/application.yml +++ b/service-ai/service-ai-knowledge/src/main/resources/application.yml @@ -33,7 +33,7 @@ spring: api-key: ENC(K+Hff9QGC+fcyi510VIDd9CaeK/IN5WBJ9rlkUsHEdDgIidW+stHHJlsK0lLPUXXREha+ToQZqqDXJrqSE+GUKCXklFhelD8bRHFXBIeP/ZzT2cxhzgKUXgjw3S0Qw2R) chat: options: - model: 'Qwen3-1.7' + model: 'Qwen3-1.7-vllm' embedding: options: model: 'Bge-m3' diff --git a/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml b/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml index 9b93803..08f7b43 100644 --- a/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml +++ b/service-ai/service-ai-knowledge/src/main/resources/config/flow.xml @@ -1,14 +1,20 @@ + - + SER( - embedding_start, - SWITCH(embedding_mode_switch).TO( - normal_embedding, - llm_embedding, - qa_embedding + IF( + embedding_check_if_file_needed, + SWITCH(file_reader_switch).TO( + txt_file_reader, + pdf_file_reader + ).DEFAULT(any_file_reader) ), - embedding_finish - ); + SWITCH(split_switch).TO( + normal_split, + llm_split, + qa_split + ) + ) \ No newline at end of file diff --git 
a/service-web/client/src/pages/ai/knowledge/DataImport.tsx b/service-web/client/src/pages/ai/knowledge/DataImport.tsx index 5df3008..3d32816 100644 --- a/service-web/client/src/pages/ai/knowledge/DataImport.tsx +++ b/service-web/client/src/pages/ai/knowledge/DataImport.tsx @@ -33,21 +33,19 @@ const DataImport: React.FC = () => { name: 'mode', type: 'radios', label: '解析模式', - value: 'normal', + value: 'NORMAL', options: [ { - value: 'normal', + value: 'NORMAL', label: '常规模式', }, { - value: 'llm', + value: 'LLM', label: '智能模式', - disabled: true, }, { - value: 'qa', + value: 'QA', label: 'Q/A模式', - disabled: true, }, ], }, @@ -105,7 +103,6 @@ const DataImport: React.FC = () => { }, dataType: 'form', data: { - name: name, mode: '${mode}', type: '${type}', content: '${content}',