feat(knowledge): 初步完成知识库分片预览

This commit is contained in:
v-zhangjc9
2025-05-23 19:12:41 +08:00
parent fce4816880
commit e57c81ce75
9 changed files with 384 additions and 59 deletions

View File

@@ -46,6 +46,14 @@
<groupId>com.yomahub</groupId>
<artifactId>liteflow-spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>
</dependencies>
<build>

View File

@@ -2,7 +2,7 @@ package com.lanyuanxiaoyao.service.ai.knowledge.controller;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.KnowledgeVO;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.vo.PointVO;
import com.lanyuanxiaoyao.service.ai.knowledge.reader.TextLineReader;
import com.lanyuanxiaoyao.service.ai.knowledge.service.EmbeddingService;
import com.lanyuanxiaoyao.service.ai.knowledge.service.KnowledgeService;
import io.qdrant.client.QdrantClient;
import io.qdrant.client.grpc.Points;
@@ -14,7 +14,6 @@ import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.embedding.EmbeddingModel;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.ai.vectorstore.VectorStore;
@@ -37,11 +36,13 @@ public class KnowledgeController {
private static final Logger logger = LoggerFactory.getLogger(KnowledgeController.class);
private final KnowledgeService knowledgeService;
private final EmbeddingService embeddingService;
private final QdrantClient client;
private final EmbeddingModel embeddingModel;
public KnowledgeController(KnowledgeService knowledgeService, VectorStore vectorStore, EmbeddingModel embeddingModel) {
public KnowledgeController(KnowledgeService knowledgeService, EmbeddingService embeddingService, VectorStore vectorStore, EmbeddingModel embeddingModel) {
this.knowledgeService = knowledgeService;
this.embeddingService = embeddingService;
client = (QdrantClient) vectorStore.getNativeClient().orElseThrow();
this.embeddingModel = embeddingModel;
}
@@ -89,15 +90,11 @@ public class KnowledgeController {
@PostMapping("preview_text")
public ImmutableList<PointVO> previewText(
@RequestParam("name") String name,
@RequestParam(value = "mode", defaultValue = "normal") String mode,
@RequestParam(value = "mode", defaultValue = "NORMAL") String mode,
@RequestParam(value = "type", defaultValue = "text") String type,
@RequestParam("content") String content
) {
TextReader reader = new TextLineReader(new ByteArrayResource(content.getBytes(StandardCharsets.UTF_8)));
return reader.get()
.stream()
.collect(Collectors.toCollection(Lists.mutable::empty))
return embeddingService.split(mode, content)
.collect(doc -> {
PointVO vo = new PointVO();
vo.setId(doc.getId());

View File

@@ -0,0 +1,130 @@
package com.lanyuanxiaoyao.service.ai.knowledge.entity;
import cn.hutool.core.util.StrUtil;
import java.io.File;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.factory.Maps;
import org.springframework.ai.document.Document;
/**
 * Mutable carrier object passed through the LiteFlow embedding chain. It holds
 * the raw input (inline text or a file path), the split configuration, and the
 * output slots for the produced documents and shared metadata.
 *
 * @author lanyuanxiaoyao
 * @version 20250523
 */
public class EmbeddingContext {

    // Split configuration; defaults to SplitStrategy.NORMAL.
    private Config config;
    // Inline text to split; may also be populated later by a file-reader node.
    private String content;
    // Absolute path of the source file when the input is file-based.
    private String file;
    // Output slot: documents produced by the split nodes.
    private List<Document> documents = Lists.mutable.empty();
    // Metadata propagated onto every produced document.
    private Map<String, Object> metadata = Maps.mutable.empty();

    public EmbeddingContext(String content) {
        this(content, new Config());
    }

    public EmbeddingContext(String content, Config config) {
        // Content is trimmed up front so downstream nodes never see padding.
        this.content = StrUtil.trim(content);
        this.config = config;
    }

    public EmbeddingContext(File file) {
        this(file, new Config());
    }

    public EmbeddingContext(File file, Config config) {
        this.file = file.getAbsolutePath();
        this.config = config;
    }

    public EmbeddingContext(Path path) {
        this(path.toFile());
    }

    public EmbeddingContext(Path path, Config config) {
        this(path.toFile(), config);
    }

    public Config getConfig() {
        return config;
    }

    public void setConfig(Config config) {
        this.config = config;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getFile() {
        return file;
    }

    public void setFile(String file) {
        this.file = file;
    }

    public List<Document> getDocuments() {
        return documents;
    }

    public void setDocuments(List<Document> documents) {
        this.documents = documents;
    }

    public Map<String, Object> getMetadata() {
        return metadata;
    }

    public void setMetadata(Map<String, Object> metadata) {
        this.metadata = metadata;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("EmbeddingContext{");
        sb.append("config=").append(config);
        sb.append(", content='").append(content).append('\'');
        sb.append(", file='").append(file).append('\'');
        sb.append(", documents=").append(documents);
        sb.append(", metadata=").append(metadata);
        return sb.append('}').toString();
    }

    /**
     * Per-run configuration for the embedding chain.
     */
    public static final class Config {

        // Which split node the chain routes to; see SplitStrategy.
        private SplitStrategy splitStrategy = SplitStrategy.NORMAL;

        public Config() {
        }

        public Config(SplitStrategy splitStrategy) {
            this.splitStrategy = splitStrategy;
        }

        public SplitStrategy getSplitStrategy() {
            return splitStrategy;
        }

        public void setSplitStrategy(SplitStrategy splitStrategy) {
            this.splitStrategy = splitStrategy;
        }

        @Override
        public String toString() {
            return "Config{" +
                    "splitStrategy=" + splitStrategy +
                    '}';
        }

        /**
         * Split strategies: token-based (NORMAL), LLM paragraph split (LLM),
         * and LLM question/answer split (QA).
         */
        public enum SplitStrategy {
            NORMAL, LLM, QA
        }
    }
}

View File

@@ -1,34 +0,0 @@
package com.lanyuanxiaoyao.service.ai.knowledge.reader;
import cn.hutool.core.util.StrUtil;
import java.util.List;
import java.util.stream.Stream;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.core.io.Resource;
/**
 * A {@link TextReader} that additionally splits each document into paragraphs
 * separated by blank lines, discarding blank fragments.
 *
 * @author lanyuanxiaoyao
 * @version 20250522
 */
public class TextLineReader extends TextReader {

    public TextLineReader(Resource resource) {
        super(resource);
    }

    /**
     * Reads the resource and splits every non-blank document on blank lines.
     *
     * Uses {@code \R\R} (any Unicode line break, twice) instead of a literal
     * {@code \n\n} so CRLF-terminated files ({@code \r\n\r\n}) split into
     * paragraphs too — the previous pattern silently treated a whole Windows
     * file as one paragraph.
     *
     * @return one {@link Document} per non-blank paragraph, each carrying the
     *         metadata of its source document; blank documents pass through
     *         unchanged
     */
    @Override
    public List<Document> get() {
        return super.get()
                .stream()
                .flatMap(doc -> {
                    String text = doc.getText();
                    if (StrUtil.isBlank(text)) {
                        // Nothing to split; keep the document as-is.
                        return Stream.of(doc);
                    }
                    return Stream.of(text.split("\\R\\R"))
                            .filter(StrUtil::isNotBlank)
                            .map(line -> new Document(line, doc.getMetadata()));
                })
                .toList();
    }
}

View File

@@ -1,7 +1,13 @@
package com.lanyuanxiaoyao.service.ai.knowledge.service;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext;
import com.yomahub.liteflow.core.FlowExecutor;
import java.nio.file.Path;
import java.util.Locale;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.stereotype.Service;
/**
@@ -12,5 +18,19 @@ import org.springframework.stereotype.Service;
public class EmbeddingService {

    private static final Logger logger = LoggerFactory.getLogger(EmbeddingService.class);

    // Executes the "embedding" LiteFlow chain defined in the flow XML.
    private final FlowExecutor executor;

    @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
    public EmbeddingService(FlowExecutor executor) {
        this.executor = executor;
    }

    /**
     * Splits raw text into documents by running the "embedding" LiteFlow chain.
     *
     * @param mode    split strategy name (NORMAL / LLM / QA); matched
     *                case-insensitively so legacy lowercase values keep working
     * @param content raw text to split
     * @return immutable snapshot of the documents produced by the chain
     * @throws IllegalArgumentException if {@code mode} is not a known strategy
     */
    public ImmutableList<Document> split(String mode, String content) {
        EmbeddingContext.Config.SplitStrategy strategy;
        try {
            // Normalize the mode — valueOf is case-sensitive and its bare
            // IllegalArgumentException message is unhelpful to API callers.
            strategy = EmbeddingContext.Config.SplitStrategy.valueOf(mode.trim().toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Unsupported split mode: " + mode, e);
        }
        EmbeddingContext context = new EmbeddingContext(content, new EmbeddingContext.Config(strategy));
        executor.execute2Resp("embedding", null, context);
        return Lists.immutable.ofAll(context.getDocuments());
    }
}

View File

@@ -0,0 +1,201 @@
package com.lanyuanxiaoyao.service.ai.knowledge.service.node;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.StrUtil;
import com.lanyuanxiaoyao.service.ai.knowledge.entity.EmbeddingContext;
import com.yomahub.liteflow.annotation.LiteflowComponent;
import com.yomahub.liteflow.annotation.LiteflowMethod;
import com.yomahub.liteflow.core.NodeComponent;
import com.yomahub.liteflow.enums.LiteFlowMethodEnum;
import com.yomahub.liteflow.enums.NodeTypeEnum;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.ExtractedTextFormatter;
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.core.io.PathResource;
/**
 * LiteFlow node components that make up the "embedding" chain: deciding
 * whether a file must be read, reading it with the appropriate reader
 * (plain text, PDF, or a Tika fallback), and splitting the content with one
 * of three strategies (token-based, LLM paragraph split, LLM Q/A split).
 *
 * @author lanyuanxiaoyao
 * @version 20250523
 */
@LiteflowComponent
public class EmbeddingNodes {

    private static final Logger logger = LoggerFactory.getLogger(EmbeddingNodes.class);

    // Client used by the LLM-based split strategies.
    private final ChatClient chatClient;

    public EmbeddingNodes(ChatClient.Builder builder) {
        this.chatClient = builder.build();
    }

    /**
     * Boolean node: returns {@code true} when the context references a file
     * that must be read before splitting (the file must exist). When no file
     * is set, non-blank inline content is required instead.
     */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_BOOLEAN, nodeId = "embedding_check_if_file_needed", nodeName = "判断是否需要读取文件", nodeType = NodeTypeEnum.BOOLEAN)
    public boolean checkIfFileReadNeeded(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        Assert.notNull(context, "EmbeddingContext is null");
        if (StrUtil.isNotBlank(context.getFile())) {
            Assert.isTrue(FileUtil.exist(context.getFile()), "File [{}] not exist", context.getFile());
            return true;
        }
        Assert.notBlank(context.getContent(), "Contents is empty");
        return false;
    }

    /** Debug node: logs the current context content. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "test_print", nodeType = NodeTypeEnum.COMMON)
    public void testPrint(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        logger.info(context.getContent());
    }

    /**
     * Switch node: routes to a reader node by file extension.
     *
     * @throws IllegalStateException for unsupported extensions
     */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "file_reader_switch", nodeName = "判断文件格式", nodeType = NodeTypeEnum.SWITCH)
    public String fileReaderSwitch(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        String extName = FileUtil.extName(context.getFile());
        return switch (extName.toLowerCase()) {
            case "txt", "md", "markdown" -> "txt_file_reader";
            case "pdf" -> "pdf_file_reader";
            case "doc", "docx", "xls", "xlsx", "ppt", "pptx", "html", "xml", "wps", "et", "dpt" -> "any_file_reader";
            default -> throw new IllegalStateException("Unsupported ext: " + extName);
        };
    }

    /**
     * Reads a plain-text/markdown file into the context content.
     * NOTE(review): reads with the platform default charset — consider
     * StandardCharsets.UTF_8 for portability; confirm expected file encoding.
     */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "txt_file_reader", nodeName = "读取文本文件", nodeType = NodeTypeEnum.COMMON)
    public void txtFileReader(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        context.setContent(FileUtil.readString(context.getFile(), Charset.defaultCharset()));
    }

    /** Reads a PDF page by page and joins the extracted text into the context. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "pdf_file_reader", nodeName = "读取pdf文件", nodeType = NodeTypeEnum.COMMON)
    public void pdfFileReader(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        PagePdfDocumentReader reader = new PagePdfDocumentReader(
                new PathResource(context.getFile()),
                PdfDocumentReaderConfig.builder()
                        .withPageTopMargin(0)
                        .withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
                                .withNumberOfTopTextLinesToDelete(0)
                                .build())
                        .build());
        context.setContent(readBySpringAiReader(reader));
    }

    /**
     * Fallback reader for office/markup formats via Apache Tika.
     * <a href="https://tika.apache.org/3.1.0/formats.html#Database_formats">Formats supported by Tika</a>
     */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "any_file_reader", nodeName = "使用Tika尝试读取文件", nodeType = NodeTypeEnum.COMMON)
    public void anyFileReader(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        context.setContent(readBySpringAiReader(new TikaDocumentReader(new PathResource(context.getFile()))));
    }

    /** Runs a Spring AI reader and joins all document texts with newlines. */
    private String readBySpringAiReader(DocumentReader reader) {
        return reader.get()
                .stream()
                .map(Document::getText)
                .collect(Collectors.joining("\n"))
                .trim();
    }

    /** Switch node: routes to the split node chosen by the configured strategy. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS_SWITCH, nodeId = "split_switch", nodeName = "判断使用什么分段方法", nodeType = NodeTypeEnum.SWITCH)
    public String splitSwitch(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        return switch (context.getConfig().getSplitStrategy()) {
            case NORMAL -> "normal_split";
            case LLM -> "llm_split";
            case QA -> "qa_split";
        };
    }

    /** Token-based split: replaces the context documents with the splitter output. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "normal_split", nodeName = "使用普通分段", nodeType = NodeTypeEnum.COMMON)
    public void normalSplit(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        // chunk=200 tokens, min-chars=100, min-to-embed=5, max-chunks=200, keep separators
        TokenTextSplitter splitter = new TokenTextSplitter(200, 100, 5, 200, true);
        Document document = Document.builder()
                .text(context.getContent())
                .build();
        context.setDocuments(splitter.split(document));
    }

    /** LLM split: asks the model for semantic paragraphs and appends them. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "llm_split", nodeName = "使用大模型分段", nodeType = NodeTypeEnum.COMMON)
    public void llmSplit(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        context.getDocuments().addAll(llmSplit(
                """
                对用户输入的文本,生成高质量的分段。请遵循以下指南:
                1. 分段原则:
                分段按文本内容的语义进行分割,每个分段都尽可能保持完整连续的内容表达。
                避免从词句的中间进行分割。
                2. 格式:
                分段之间用两个空行分隔,以提高可读性。
                避免使用任何Markdown格式
                3. 内容要求:
                确保每个分段的内容文字完全依照原文。
                避免添加任何原文中不存在的文字。
                """,
                context.getContent(),
                context.getMetadata()
        ));
    }

    /** Q/A split: asks the model for question/answer pairs and appends them. */
    @LiteflowMethod(value = LiteFlowMethodEnum.PROCESS, nodeId = "qa_split", nodeName = "使用Q/A格式分段", nodeType = NodeTypeEnum.COMMON)
    public void qaSplit(NodeComponent node) {
        EmbeddingContext context = node.getContextBean(EmbeddingContext.class);
        context.getDocuments().addAll(llmSplit(
                """
                对用户输入的文本,生成一组高质量的问答对。请遵循以下指南:
                1. 问题部分:
                为同一个主题创建尽可能多的不同表述的问题,确保问题的多样性。
                每个问题应考虑用户可能的多种问法,例如:
                直接询问(如“什么是...?”)
                请求确认(如“是否可以说...?”)
                寻求解释(如“请解释一下...的含义。”)
                假设性问题(如“如果...会怎样?”)
                例子请求(如“能否举个例子说明...?”)
                问题应涵盖文本中的关键信息、主要概念和细节,确保不遗漏重要内容。
                2. 答案部分:
                提供一个全面、信息丰富的答案,涵盖问题的所有可能角度,确保逻辑连贯。
                答案应直接基于给定文本,确保准确性和一致性。
                包含相关的细节,如日期、名称、职位等具体信息,必要时提供背景信息以增强理解。
                3. 格式:
                使用"Q:"标记问题集合的开始,所有问题应在一个段落内,问题之间用空格分隔。
                使用"A:"标记答案的开始,答案应清晰分段,便于阅读。
                问答对之间用两个空行分隔,以提高可读性。
                避免使用任何Markdown格式
                4. 内容要求:
                确保问答对紧密围绕文本主题,避免偏离主题。
                避免添加文本中未提及的信息,确保信息的真实性。
                如果文本信息不足以回答某个方面,可以在答案中说明 "根据给定信息无法确定",并尽量提供相关的上下文。
                """,
                context.getContent(),
                context.getMetadata()
        ));
    }

    /**
     * Sends {@code content} to the LLM under the given system prompt and turns
     * each blank-line-separated paragraph of the response into a Document
     * carrying {@code metadata}.
     *
     * @throws cn.hutool.core.exceptions.ValidateException if the LLM response is blank
     */
    private List<Document> llmSplit(String prompt, String content, Map<String, Object> metadata) {
        String response = chatClient.prompt()
                .system(prompt)
                .user(content)
                .call()
                .content();
        Assert.notBlank(response, "LLM response is empty");
        // Split on blank lines (consuming trailing whitespace before them).
        // Bug fix: the previous pattern "(s?)\s*\n\n" contained a stray "(s?)"
        // group — an optional literal 's' (likely a typo for the "(?s)" inline
        // flag) — which silently stripped a trailing 's' from paragraphs
        // (e.g. "...cats\n\n" became "...cat").
        // noinspection DataFlowIssue
        return Arrays.stream(StrUtil.trim(response).split("\\s*\\n\\n"))
                .map(StrUtil::trim)
                .map(text -> Document.builder()
                        .text(text)
                        .metadata(metadata)
                        .build())
                .toList();
    }
}

View File

@@ -33,7 +33,7 @@ spring:
api-key: ENC(K+Hff9QGC+fcyi510VIDd9CaeK/IN5WBJ9rlkUsHEdDgIidW+stHHJlsK0lLPUXXREha+ToQZqqDXJrqSE+GUKCXklFhelD8bRHFXBIeP/ZzT2cxhzgKUXgjw3S0Qw2R)
chat:
options:
model: 'Qwen3-1.7'
model: 'Qwen3-1.7-vllm'
embedding:
options:
model: 'Bge-m3'

View File

@@ -1,14 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE flow PUBLIC "liteflow" "https://liteflow.cc/liteflow.dtd">
<flow>
<chain name="embedding">
<chain id="embedding">
SER(
embedding_start,
SWITCH(embedding_mode_switch).TO(
normal_embedding,
llm_embedding,
qa_embedding
IF(
embedding_check_if_file_needed,
SWITCH(file_reader_switch).TO(
txt_file_reader,
pdf_file_reader
).DEFAULT(any_file_reader)
),
embedding_finish
);
SWITCH(split_switch).TO(
normal_split,
llm_split,
qa_split
)
)
</chain>
</flow>

View File

@@ -33,21 +33,19 @@ const DataImport: React.FC = () => {
name: 'mode',
type: 'radios',
label: '解析模式',
value: 'normal',
value: 'NORMAL',
options: [
{
value: 'normal',
value: 'NORMAL',
label: '常规模式',
},
{
value: 'llm',
value: 'LLM',
label: '智能模式',
disabled: true,
},
{
value: 'qa',
value: 'QA',
label: 'Q/A模式',
disabled: true,
},
],
},
@@ -105,7 +103,6 @@ const DataImport: React.FC = () => {
},
dataType: 'form',
data: {
name: name,
mode: '${mode}',
type: '${type}',
content: '${content}',