Compare commits
38 Commits
b80c635f07
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| d3fd6de965 | |||
| 277c14d2e8 | |||
| 5cc347589b | |||
| 89ffc88082 | |||
| 675235f5b3 | |||
| a490b2642c | |||
| 1306dd5971 | |||
| e0c6ed1638 | |||
| 0dd7aa221c | |||
| 3b2b368db2 | |||
| a578c0b7ac | |||
| 78063b9e07 | |||
| edbdeec90d | |||
| a5c0b67360 | |||
| 82b09614d3 | |||
| c90e1c98be | |||
| 229f17bfee | |||
| e67ec24dfd | |||
| aa1f0a9e94 | |||
| a8af3cc6c4 | |||
| 65c746c639 | |||
| fad0edc46a | |||
| 725b91374f | |||
| cf10458dd6 | |||
| e53e64d386 | |||
| 688933c228 | |||
| 25d748aa17 | |||
| 9abc0a0707 | |||
| aaa1171e60 | |||
| 9daff73589 | |||
| 6e75c99d5b | |||
| d860e17b2c | |||
| c140bda66b | |||
| dfe6904f4c | |||
| b2fb418a06 | |||
| 58093e0877 | |||
| 47038475d4 | |||
| 1aea561277 |
@@ -1,7 +1,6 @@
|
|||||||
{
|
{
|
||||||
"permissions": {
|
"permissions": {
|
||||||
"allow": [
|
"allow": [
|
||||||
"WebSearch",
|
|
||||||
"WebFetch(*)",
|
"WebFetch(*)",
|
||||||
"Bash(openspec:*)",
|
"Bash(openspec:*)",
|
||||||
"Bash(git:*)",
|
"Bash(git:*)",
|
||||||
@@ -10,8 +9,12 @@
|
|||||||
"Bash(wc:*)",
|
"Bash(wc:*)",
|
||||||
"Bash(curl:*)",
|
"Bash(curl:*)",
|
||||||
"mcp__context7__query-docs",
|
"mcp__context7__query-docs",
|
||||||
|
"mcp__context7__resolve-library-id",
|
||||||
"mcp__exa__web_search_exa",
|
"mcp__exa__web_search_exa",
|
||||||
"mcp__exa__get_code_context_exa"
|
"mcp__exa__get_code_context_exa"
|
||||||
|
],
|
||||||
|
"deny": [
|
||||||
|
"WebSearch"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
12
.gitattributes
vendored
12
.gitattributes
vendored
@@ -1,13 +1,3 @@
|
|||||||
# Git LFS 配置
|
# Git LFS 配置
|
||||||
# 追踪大型二进制测试文件
|
# 追踪大型二进制测试文件
|
||||||
# PDF 文件
|
tests/test_readers/fixtures/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
tests/fixtures/documents/**/*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
# Office 文档(可选,根据需要启用)
|
|
||||||
tests/fixtures/documents/**/*.docx filter=lfs diff=lfs merge=lfs -text
|
|
||||||
tests/fixtures/documents/**/*.xlsx filter=lfs diff=lfs merge=lfs -text
|
|
||||||
tests/fixtures/documents/**/*.pptx filter=lfs diff=lfs merge=lfs -text
|
|
||||||
# 图片文件
|
|
||||||
tests/fixtures/documents/**/*.png filter=lfs diff=lfs merge=lfs -text
|
|
||||||
tests/fixtures/documents/**/*.jpg filter=lfs diff=lfs merge=lfs -text
|
|
||||||
tests/fixtures/documents/**/*.jpeg filter=lfs diff=lfs merge=lfs -text
|
|
||||||
tests/fixtures/documents/**/*.gif filter=lfs diff=lfs merge=lfs -text
|
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -174,6 +174,9 @@ ipython_config.py
|
|||||||
# pipenv
|
# pipenv
|
||||||
Pipfile.lock
|
Pipfile.lock
|
||||||
|
|
||||||
|
# uv
|
||||||
|
uv.lock
|
||||||
|
|
||||||
# PEP 582
|
# PEP 582
|
||||||
__pypackages__/
|
__pypackages__/
|
||||||
|
|
||||||
|
|||||||
271
README.md
271
README.md
@@ -1,123 +1,230 @@
|
|||||||
# lyxy-document
|
# lyxy-document
|
||||||
|
|
||||||
统一文档解析工具 - 将 DOCX、XLSX、PPTX、PDF、HTML/URL 转换为 Markdown
|
统一文档解析工具 - 将 DOC、DOCX、XLS、XLSX、PPT、PPTX、PDF、HTML/URL 转换为 Markdown
|
||||||
|
|
||||||
|
## 项目概述
|
||||||
|
|
||||||
|
面向 AI Skill 的统一文档解析工具,支持多种文档格式解析为 Markdown,提供全文输出、字数统计、标题提取、内容搜索等功能。
|
||||||
|
|
||||||
## 开发环境
|
## 开发环境
|
||||||
|
|
||||||
- 使用 uv 管理依赖,禁用主机 Python
|
- 使用 uv 运行脚本和测试,禁用主机 Python
|
||||||
- 依赖声明:pyproject.toml
|
- 依赖管理:使用 `uv run --with` 按需加载依赖
|
||||||
- 安装:uv sync
|
- 自启动机制:脚本自动检测依赖并用正确的 uv 命令执行
|
||||||
|
|
||||||
## 项目结构
|
## 项目架构
|
||||||
|
|
||||||
```
|
```
|
||||||
scripts/ # 核心代码
|
scripts/
|
||||||
├── core/ # 核心模块(解析调度、异常、Markdown 工具)
|
├── lyxy_document_reader.py # CLI 入口(自启动)
|
||||||
|
├── bootstrap.py # 实际执行模块
|
||||||
|
├── config.py # 配置(含 DEPENDENCIES 依赖配置)
|
||||||
|
├── core/ # 核心模块
|
||||||
|
│ ├── parser.py # 解析调度
|
||||||
|
│ ├── advice_generator.py # 依赖检测和配置生成
|
||||||
|
│ ├── markdown.py # Markdown 工具
|
||||||
|
│ └── exceptions.py # 异常定义
|
||||||
├── readers/ # 格式阅读器
|
├── readers/ # 格式阅读器
|
||||||
|
│ ├── base.py # Reader 基类
|
||||||
|
│ ├── doc/ # DOC 解析器(旧格式)
|
||||||
|
│ ├── docx/ # DOCX 解析器
|
||||||
|
│ ├── xls/ # XLS 解析器(旧格式)
|
||||||
|
│ ├── xlsx/ # XLSX 解析器
|
||||||
|
│ ├── ppt/ # PPT 解析器(旧格式)
|
||||||
|
│ ├── pptx/ # PPTX 解析器
|
||||||
|
│ ├── pdf/ # PDF 解析器
|
||||||
|
│ └── html/ # HTML/URL 解析器
|
||||||
└── utils/ # 工具函数
|
└── utils/ # 工具函数
|
||||||
tests/ # 测试
|
├── file_detection.py # 文件检测
|
||||||
openspec/ # 规范文档
|
└── encoding_detection.py # 编码检测
|
||||||
skill/ # SKILL 文档
|
|
||||||
|
tests/ # 测试套件
|
||||||
|
├── test_readers/ # Reader 测试
|
||||||
|
│ └── fixtures/ # 静态测试文件(Git LFS 管理)
|
||||||
|
│ └── xls/ # XLS 旧格式测试文件
|
||||||
|
openspec/ # OpenSpec 规范文档
|
||||||
|
build.py # 构建脚本(混淆模式)
|
||||||
|
publish.py # 发布脚本
|
||||||
|
publish.sh # 一键构建+发布
|
||||||
|
README.md # 本文档(开发者文档)
|
||||||
|
SKILL.md # AI Skill 文档
|
||||||
```
|
```
|
||||||
|
|
||||||
## 开发工作流
|
## 测试 Fixtures 规范
|
||||||
|
|
||||||
|
### 静态测试文件目录
|
||||||
|
|
||||||
|
`tests/test_readers/fixtures/` 目录用于存放**预先准备的静态测试文件**,特别是难以通过 Python 自动化创建的旧格式文件(.xls)。
|
||||||
|
|
||||||
|
### 目录使用规则
|
||||||
|
|
||||||
|
1. **仅存放静态文件**:该目录下的文件必须是预先准备好的,禁止在测试运行时向该目录动态生成临时文件。
|
||||||
|
2. **临时文件使用 tmp_path**:测试中需要临时文件时,使用 pytest 的 `tmp_path` fixture 在其他位置创建。
|
||||||
|
3. **Git LFS 管理**:该目录下所有文件通过 Git LFS 管理,见 `.gitattributes` 配置。
|
||||||
|
|
||||||
|
### Fixture 说明
|
||||||
|
|
||||||
|
`tests/test_readers/conftest.py` 提供以下静态文件 fixtures:
|
||||||
|
|
||||||
|
- 目录路径:`xls_fixture_path`
|
||||||
|
- 单个文件:`simple_xls_path` 等
|
||||||
|
|
||||||
|
文件不存在时会自动 `pytest.skip()`,保证 CI 稳定性。
|
||||||
|
|
||||||
|
## 核心概念
|
||||||
|
|
||||||
|
### Reader 机制
|
||||||
|
|
||||||
|
每种文档格式对应一个 Reader 包,包含多个解析实现。Reader 基类定义 `supports()` 和 `parse()` 方法,解析器按顺序尝试,第一个成功的结果返回。
|
||||||
|
|
||||||
|
### 依赖配置 (config.DEPENDENCIES)
|
||||||
|
|
||||||
|
按文件类型和平台组织依赖配置:
|
||||||
|
|
||||||
|
```python
|
||||||
|
DEPENDENCIES = {
|
||||||
|
"pdf": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": ["docling", "unstructured[pdf]", ...]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": ["docling==2.40.0", ...]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 自启动机制
|
||||||
|
|
||||||
|
入口脚本根据文件扩展名识别类型,检测当前平台,从 `config.DEPENDENCIES` 读取对应配置,自动生成并执行正确的 `uv run --with` 命令。
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 验证环境
|
||||||
|
|
||||||
|
首先验证项目可以正常运行:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 运行测试
|
# 测试解析功能(自动检测依赖并执行)
|
||||||
uv run pytest
|
python scripts/lyxy_document_reader.py "https://example.com"
|
||||||
|
|
||||||
# 运行测试并查看覆盖率
|
|
||||||
uv run pytest --cov=scripts --cov-report=term-missing
|
|
||||||
|
|
||||||
# 运行特定测试文件
|
|
||||||
uv run pytest tests/test_readers/test_docx/
|
|
||||||
|
|
||||||
# 运行特定测试类或方法
|
|
||||||
uv run pytest tests/test_cli/test_main.py::TestCLIDefaultOutput::test_default_output_docx
|
|
||||||
|
|
||||||
# 代码格式化
|
|
||||||
uv run black .
|
|
||||||
uv run isort .
|
|
||||||
|
|
||||||
# 类型检查
|
|
||||||
uv run mypy .
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 测试
|
### 运行基础测试
|
||||||
|
|
||||||
项目包含完整的测试套件,覆盖 CLI 和所有 Reader 实现:
|
|
||||||
|
|
||||||
- **测试覆盖率**: 69%
|
|
||||||
- **测试数量**: 193 个测试
|
|
||||||
- **测试类型**:
|
|
||||||
- CLI 功能测试(字数统计、行数统计、标题提取、搜索等)
|
|
||||||
- Reader 解析测试(DOCX、PDF、HTML、PPTX、XLSX)
|
|
||||||
- 多 Reader 实现测试(每种格式测试多个解析库)
|
|
||||||
- 异常场景测试(文件不存在、空文件、损坏文件、特殊字符)
|
|
||||||
- 编码测试(GBK、UTF-8 BOM 等)
|
|
||||||
- 一致性测试(验证不同 Reader 解析结果的一致性)
|
|
||||||
|
|
||||||
运行测试前确保已安装所有依赖:
|
|
||||||
```bash
|
```bash
|
||||||
uv sync
|
# 使用 run_tests.py 自动加载依赖并运行测试
|
||||||
|
python run_tests.py cli -v
|
||||||
```
|
```
|
||||||
|
|
||||||
## 代码规范
|
## 开发指南
|
||||||
|
|
||||||
|
### 测试前置依赖说明
|
||||||
|
|
||||||
|
`HtmlReader` 模块在导入时会加载 `cleaner.py`,但 `cleaner.py` 中的第三方库已改为动态导入,因此仅导入模块时无需安装额外依赖。
|
||||||
|
|
||||||
|
`beautifulsoup4` 和 `chardet` 仅在实际使用 HTML 功能时才需要,模块导入时不依赖。
|
||||||
|
|
||||||
|
### 如何添加新的 Reader
|
||||||
|
|
||||||
|
1. 在 `scripts/readers/` 下创建新目录
|
||||||
|
2. 继承 `BaseReader` 实现 `supports()` 和 `parse()`
|
||||||
|
3. 在 `scripts/readers/__init__.py` 中注册
|
||||||
|
4. 在 `config.DEPENDENCIES` 中添加依赖配置
|
||||||
|
|
||||||
|
### 如何测试
|
||||||
|
|
||||||
|
项目包含完整的测试套件,覆盖 CLI、核心模块、工具函数和所有 Reader 实现。使用 `run_tests.py` 自动加载对应依赖并运行测试。
|
||||||
|
|
||||||
|
#### 测试目录结构
|
||||||
|
- tests/test_cli/ - CLI 功能测试
|
||||||
|
- tests/test_core/ - 核心模块测试(markdown, parser, advice_generator)
|
||||||
|
- tests/test_readers/ - 各格式 Reader 测试
|
||||||
|
- tests/test_utils/ - 工具函数测试(file_detection, encoding_detection)
|
||||||
|
|
||||||
|
#### run_tests.py 使用说明
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看帮助
|
||||||
|
python run_tests.py -h
|
||||||
|
|
||||||
|
# 运行所有测试
|
||||||
|
python run_tests.py all
|
||||||
|
|
||||||
|
# 运行特定类型测试
|
||||||
|
python run_tests.py pdf
|
||||||
|
python run_tests.py docx
|
||||||
|
python run_tests.py xlsx
|
||||||
|
python run_tests.py pptx
|
||||||
|
python run_tests.py html
|
||||||
|
python run_tests.py xls
|
||||||
|
python run_tests.py doc
|
||||||
|
python run_tests.py ppt
|
||||||
|
python run_tests.py cli
|
||||||
|
python run_tests.py core
|
||||||
|
python run_tests.py utils
|
||||||
|
|
||||||
|
# 透传 pytest 参数
|
||||||
|
python run_tests.py pdf -v
|
||||||
|
python run_tests.py pdf --cov=scripts
|
||||||
|
python run_tests.py pdf tests/test_readers/test_pdf/test_docling_pdf.py
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 查看测试覆盖率
|
||||||
|
```bash
|
||||||
|
python run_tests.py all --with pytest-cov --cov=scripts --cov-report=term-missing
|
||||||
|
```
|
||||||
|
|
||||||
|
### 代码规范
|
||||||
|
|
||||||
- 语言:仅中文(交流、注释、文档、代码)
|
- 语言:仅中文(交流、注释、文档、代码)
|
||||||
- 模块文件:150-300 行
|
- 模块文件:150-300 行
|
||||||
- 错误处理:自定义异常 + 清晰信息 + 位置上下文
|
- 错误处理:自定义异常 + 清晰信息 + 位置上下文
|
||||||
- Git 提交:类型: 简短描述(feat/fix/refactor/docs/style/test/chore)
|
- Git 提交:`类型: 简短描述`(feat/fix/refactor/docs/style/test/chore)
|
||||||
|
|
||||||
## Skill 文档规范
|
## 构建与发布
|
||||||
|
|
||||||
skill/SKILL.md 面向 AI 用户,必须遵循 Claude Skill 构建指南的最佳实践:
|
### 构建脚本
|
||||||
|
|
||||||
### YAML frontmatter
|
项目提供 `build.py` 用于构建 Skill 包,使用 PyArmor 进行代码混淆:
|
||||||
|
|
||||||
- **name**: kebab-case 格式
|
```bash
|
||||||
- **description**: 包含功能说明、触发词、文件类型、典型任务
|
uv run --with pyarmor python build.py
|
||||||
- **license**: MIT
|
```
|
||||||
- **metadata**: 包含 version、author
|
|
||||||
- **compatibility**: 说明 Python 版本要求和依赖情况
|
|
||||||
|
|
||||||
### 文档章节结构
|
构建产物输出到 `build/` 目录,包含:
|
||||||
|
- `SKILL.md`(动态注入 version 和 author)
|
||||||
|
- `scripts/`(混淆后的代码)
|
||||||
|
|
||||||
1. **Purpose**: 说明统一入口和双路径执行策略
|
### 发布脚本
|
||||||
2. **When to Use**: 典型场景和触发词列表(中英文、文件扩展名)
|
|
||||||
3. **Quick Reference**: 命令参数表格
|
|
||||||
4. **Workflow**: 4 步工作流程(检测环境、识别类型、执行解析、输出结果)
|
|
||||||
5. **使用示例**: 各文档类型的基本用法和高级用法
|
|
||||||
6. **错误处理**: 常见错误及解决方案
|
|
||||||
7. **References**: 指向项目文档的链接
|
|
||||||
|
|
||||||
### 双路径执行策略
|
提供 `publish.py` 用于自动发布到目标仓库:
|
||||||
|
|
||||||
- **优先**: 使用 lyxy-runner-python skill(自动管理依赖)
|
```bash
|
||||||
- **回退**: 主机 Python 环境(需手动安装依赖)
|
uv run python publish.py
|
||||||
|
```
|
||||||
|
|
||||||
### 依赖说明
|
发布流程:
|
||||||
|
1. 在临时目录 clone `https://github.com/lanyuanxiaoyao/skills.git`(--depth 1)
|
||||||
|
2. 清空 `skills/lyxy-document-reader/` 目录
|
||||||
|
3. 复制 `build/` 内容到目标路径
|
||||||
|
4. Git 提交并推送
|
||||||
|
|
||||||
- 必须使用具体的 pip 包名
|
### 一键发布
|
||||||
- 不能使用 lyxy-document[xxx] 形式(发布时没有 pyproject.toml)
|
|
||||||
- 按文档类型分组说明
|
|
||||||
|
|
||||||
## 解析器架构
|
使用 `publish.sh` 一键完成构建+发布:
|
||||||
|
|
||||||
### DOCX
|
```bash
|
||||||
docling、unstructured、pypandoc-binary、MarkItDown、python-docx、XML
|
./publish.sh
|
||||||
|
```
|
||||||
|
|
||||||
### XLSX
|
## 文档说明
|
||||||
docling、unstructured、MarkItDown、pandas、XML
|
|
||||||
|
|
||||||
### PPTX
|
- **README.md**(本文档):面向项目开发者
|
||||||
docling、unstructured、MarkItDown、python-pptx、XML
|
- **SKILL.md**:面向 AI 使用的 Skill 文档
|
||||||
|
- **openspec/**:OpenSpec 规范文档
|
||||||
### PDF(OCR 优先)
|
|
||||||
docling OCR、unstructured OCR、docling、unstructured、MarkItDown、pypdf
|
|
||||||
|
|
||||||
### HTML/URL
|
|
||||||
trafilatura、domscribe、MarkItDown、html2text
|
|
||||||
|
|
||||||
## 许可证
|
## 许可证
|
||||||
|
|
||||||
|
|||||||
91
SKILL.md
Normal file
91
SKILL.md
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
---
|
||||||
|
name: lyxy-document-reader
|
||||||
|
description: 统一文档解析工具 - 将 DOC、DOCX、XLS、XLSX、PPT、PPTX、PDF、HTML/URL 转换为 Markdown。支持全文输出、字数统计、行数统计、标题提取、章节提取、正则搜索。当用户要求"读取/解析/打开文档"、上传 .doc/.docx/.xls/.xlsx/.ppt/.pptx/.pdf/.html 文件、或提供 URL 时使用。
|
||||||
|
license: MIT
|
||||||
|
compatibility: Requires Python 3.11+。脚本自启动,自动检测依赖并使用 uv 执行。
|
||||||
|
---
|
||||||
|
|
||||||
|
# 统一文档解析 Skill
|
||||||
|
|
||||||
|
## 推荐用法
|
||||||
|
|
||||||
|
直接运行脚本即可,它会自动检测文件类型、当前平台,并用正确的 uv 命令执行:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/lyxy_document_reader.py <文件路径或URL>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
**支持格式**
|
||||||
|
- DOC(Word 旧格式)
|
||||||
|
- DOCX(Word 文档)
|
||||||
|
- XLS(Excel 旧格式)
|
||||||
|
- XLSX(Excel 表格)
|
||||||
|
- PPT(PowerPoint 旧格式)
|
||||||
|
- PPTX(PowerPoint 演示文稿)
|
||||||
|
- PDF(PDF 文档,支持 OCR)
|
||||||
|
- HTML / URL(网页内容)
|
||||||
|
|
||||||
|
## When to Use
|
||||||
|
|
||||||
|
### 触发场景
|
||||||
|
- 文档转换:将各类文档转为 Markdown
|
||||||
|
- 文档元数据:字数、行数统计
|
||||||
|
- 标题分析:提取标题结构
|
||||||
|
- 章节提取:提取特定章节
|
||||||
|
- 内容搜索:关键词或正则搜索
|
||||||
|
|
||||||
|
### 触发词
|
||||||
|
- 中文:"读取/解析/打开 文档/Word/Excel/PPT/PDF/网页"
|
||||||
|
- 英文:"read/parse/extract document/doc/docx/xls/xlsx/ppt/pptx/pdf/html"
|
||||||
|
- 文件扩展名:`.doc`、`.docx`、`.xls`、`.xlsx`、`.ppt`、`.pptx`、`.pdf`、`.html`、`.htm`
|
||||||
|
- URL:`http://`、`https://`
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
| 参数 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| (无) | 输出完整 Markdown |
|
||||||
|
| `-c/--count` | 字数统计 |
|
||||||
|
| `-l/--lines` | 行数统计 |
|
||||||
|
| `-t/--titles` | 提取所有标题(1-6级) |
|
||||||
|
| `-tc <name>` | 提取指定标题的章节内容 |
|
||||||
|
| `-s <pattern>` | 正则表达式搜索 |
|
||||||
|
| `-n <num>/--context <num>` | 与 `-s` 配合,指定上下文行数(默认 2) |
|
||||||
|
|
||||||
|
## 参数使用示例
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 读取全文(自动检测依赖)
|
||||||
|
python scripts/lyxy_document_reader.py document.docx
|
||||||
|
|
||||||
|
# 统计字数
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -c
|
||||||
|
|
||||||
|
# 提取标题
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -t
|
||||||
|
|
||||||
|
# 提取指定章节
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -tc "第三章"
|
||||||
|
|
||||||
|
# 搜索内容
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -s "关键词"
|
||||||
|
|
||||||
|
# 正则搜索
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -s "\d{4}-\d{2}-\d{2}"
|
||||||
|
|
||||||
|
# 指定搜索上下文行数
|
||||||
|
python scripts/lyxy_document_reader.py document.docx -s "关键词" -n 5
|
||||||
|
```
|
||||||
|
|
||||||
|
## 错误处理
|
||||||
|
|
||||||
|
| 错误 | 原因 | 解决 |
|
||||||
|
|------|------|------|
|
||||||
|
| 错误: input_path 不能为空 | 未提供输入 | 提供 file_path 或 URL |
|
||||||
|
| 错误: 不支持的文件类型 | 无对应 reader | 检查文件扩展名 |
|
||||||
|
| 所有解析方法均失败 | 所有解析器失败 | 检查文件是否损坏 |
|
||||||
|
| 错误: 无效的正则表达式 | 正则语法错误 | 检查正则语法 |
|
||||||
|
| 错误: 未找到匹配 | 搜索无结果 | 检查搜索词或正则 |
|
||||||
|
| ModuleNotFoundError | 缺少依赖 | 脚本会自动检测并安装依赖 |
|
||||||
304
build.py
Normal file
304
build.py
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Skill 打包构建脚本(混淆模式)
|
||||||
|
|
||||||
|
使用方式:
|
||||||
|
uv run --with pyarmor python build.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
def generate_timestamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``.

    Returns:
        The timestamp string (15 characters: 8 digits, an underscore, 6 digits).
    """
    # datetime supports strftime directives directly in a format spec.
    return f"{datetime.now():%Y%m%d_%H%M%S}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_git_config(key: str) -> str:
    """Read a single git configuration value via ``git config --get``.

    Args:
        key: Configuration key, e.g. "user.name".

    Returns:
        The command's standard output with surrounding whitespace stripped.

    Raises:
        subprocess.CalledProcessError: ``git config`` exited with a non-zero
            status (``check=True``).
    """
    command = ["git", "config", "--get", key]
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_git_user_info() -> tuple[str, str]:
    """Read git ``user.name`` and ``user.email``.

    The name is looked up first, then the email. If a lookup fails, a
    three-line hint is printed and the process exits.

    Returns:
        A ``(name, email)`` tuple.

    Raises:
        SystemExit: Exit code 1 when either setting cannot be read.
    """
    # (config key, error line, hint line, example command) per required setting
    required = (
        (
            "user.name",
            "错误: git user.name 未设置",
            "请先配置 git 用户名:",
            ' git config --global user.name "Your Name"',
        ),
        (
            "user.email",
            "错误: git user.email 未设置",
            "请先配置 git 邮箱:",
            ' git config --global user.email "your@email.com"',
        ),
    )

    values = []
    for key, error_line, hint_line, example_line in required:
        try:
            values.append(get_git_config(key))
        except subprocess.CalledProcessError:
            print(error_line)
            print(hint_line)
            print(example_line)
            sys.exit(1)

    name, email = values
    return name, email
|
||||||
|
|
||||||
|
|
||||||
|
def clean_and_create_build_dir(build_dir: str) -> None:
    """Remove any previous build directory and create a fresh, empty one.

    Args:
        build_dir: Path of the build directory.
    """
    already_present = os.path.exists(build_dir)
    if already_present:
        # Drop the whole previous build tree so no stale artifacts survive.
        shutil.rmtree(build_dir)
    os.makedirs(build_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def copy_skill_md(source_path: str, target_dir: str, version: str, author: str) -> None:
    """Copy the SKILL.md template to ``target_dir/SKILL.md`` with build metadata injected.

    The YAML frontmatter is the region between a ``---`` line that is the
    first non-blank line of the file and the next ``---`` line. Inside it:

    * if a ``metadata:`` key exists, its direct ``version:`` / ``author:``
      children are replaced (duplicates dropped), other children are kept,
      and missing keys are appended using the indentation of the existing
      children;
    * otherwise a ``metadata:`` block is appended immediately before the
      closing ``---``.

    If the file has no frontmatter, a new one is prepended.

    Args:
        source_path: Path of the source SKILL.md.
        target_dir: Directory that receives the generated SKILL.md.
        version: Version string to inject.
        author: Author string to inject (format: "Name <email>").
    """

    def _yaml_quote(value: str) -> str:
        # Emit a YAML double-quoted scalar; escape the characters that would
        # otherwise terminate or corrupt it (e.g. a quote in a git user name).
        escaped = value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
        return f'"{escaped}"'

    target_path = os.path.join(target_dir, "SKILL.md")

    with open(source_path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    version_value = _yaml_quote(version)
    author_value = _yaml_quote(author)

    # Frontmatter must open on the first non-blank line; a pair of "---"
    # horizontal rules further down the document is not frontmatter.
    start = next((i for i, line in enumerate(lines) if line.strip()), -1)
    end = -1
    if start >= 0 and lines[start].rstrip() == "---":
        end = next(
            (i for i in range(start + 1, len(lines)) if lines[i].rstrip() == "---"),
            -1,
        )

    if end == -1:
        # No (closed) frontmatter: create one in front of the original text.
        result_lines = [
            "---",
            "name: lyxy-document-reader",
            "metadata:",
            f" version: {version_value}",
            f" author: {author_value}",
            "---",
            "",
        ] + lines
    else:
        body = lines[start + 1:end]
        meta_idx = next(
            (i for i, line in enumerate(body) if line.rstrip() == "metadata:"),
            -1,
        )

        if meta_idx == -1:
            # Append after every existing key so an indented last line is not
            # accidentally re-parented under the new "metadata:" mapping.
            body = body + [
                "metadata:",
                f" version: {version_value}",
                f" author: {author_value}",
            ]
        else:
            # The metadata block is the run of space-indented lines that follows.
            stop = meta_idx + 1
            while stop < len(body) and body[stop].rstrip().startswith(" "):
                stop += 1
            children = body[meta_idx + 1:stop]

            # Reuse the existing child indentation so injected keys line up.
            if children:
                first = children[0]
                indent = first[:len(first) - len(first.lstrip(" "))]
            else:
                indent = " "

            replacements = {
                "version:": f"{indent}version: {version_value}",
                "author:": f"{indent}author: {author_value}",
            }
            pending = dict(replacements)
            rewritten: list[str] = []
            for child in children:
                key = next(
                    (k for k in replacements if child.startswith(indent + k)),
                    None,
                )
                if key is None:
                    rewritten.append(child)  # unrelated metadata child: keep
                elif key in pending:
                    rewritten.append(pending.pop(key))
                # else: duplicate version/author line, drop it
            rewritten.extend(pending.values())  # keys the template lacked

            body = body[:meta_idx + 1] + rewritten + body[stop:]

        result_lines = lines[:start + 1] + body + lines[end:]

    with open(target_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result_lines))
|
||||||
|
|
||||||
|
|
||||||
|
def _move_replacing(src: str, dst: str) -> None:
    """Move ``src`` to ``dst``, first deleting any file or directory already at ``dst``."""
    if os.path.exists(dst):
        if os.path.isdir(dst):
            shutil.rmtree(dst)
        else:
            os.remove(dst)
    shutil.move(src, dst)


def obfuscate_scripts_dir(source_dir: str, target_dir: str) -> None:
    """Obfuscate the scripts directory with PyArmor and place the result in ``target_dir``.

    Runs ``pyarmor gen --recursive`` into ``target_dir/temp_pyarmor``, moves
    the generated ``scripts`` entry to ``target_dir/scripts``, moves every
    ``pyarmor_runtime*`` entry inside that directory, and finally removes the
    temporary directory (also when a step fails).

    Args:
        source_dir: Source code directory (scripts/).
        target_dir: Build output directory (build/).

    Raises:
        SystemExit: Exit code 1 when PyArmor is not importable, the ``pyarmor``
            command cannot be started or fails, or its output contains no
            ``scripts`` directory.
    """
    # Fail early with a usage hint when PyArmor is not installed.
    try:
        __import__("pyarmor")
    except ImportError:
        print("错误: PyArmor 未安装")
        print("请使用以下命令:")
        print(" uv run --with pyarmor python build.py")
        sys.exit(1)

    temp_dir = os.path.join(target_dir, "temp_pyarmor")

    # Start from a clean temporary directory.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    # PyArmor command (normal mode)
    cmd = [
        "pyarmor",
        "gen",
        "--recursive",
        "-O", temp_dir,
        source_dir,
    ]

    try:
        try:
            subprocess.run(cmd, check=True, capture_output=True, text=True)
        except FileNotFoundError:
            # The module imported but the CLI entry point is not on PATH.
            print("错误: 未找到 pyarmor 命令")
            print("请使用以下命令:")
            print(" uv run --with pyarmor python build.py")
            sys.exit(1)
        except subprocess.CalledProcessError as e:
            print("错误: PyArmor 混淆失败")
            print(f" 返回码: {e.returncode}")
            print(f" 标准输出: {e.stdout}")
            print(f" 错误输出: {e.stderr}")
            sys.exit(1)

        entries = os.listdir(temp_dir)
        if "scripts" not in entries:
            # Without this check the runtime move below would fail obscurely.
            print("错误: PyArmor 输出中未找到 scripts 目录")
            sys.exit(1)

        # Move the obfuscated scripts directory first ...
        scripts_dst_dir = os.path.join(target_dir, "scripts")
        _move_replacing(os.path.join(temp_dir, "scripts"), scripts_dst_dir)

        # ... then place the PyArmor runtime inside it.
        for item in entries:
            if item.startswith("pyarmor_runtime"):
                _move_replacing(
                    os.path.join(temp_dir, item),
                    os.path.join(scripts_dst_dir, item),
                )
    finally:
        # rmtree (not rmdir): PyArmor may leave extra entries behind, and the
        # temporary directory must not leak into the build output on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the complete obfuscated packaging flow.

    Steps, in order: generate the version timestamp, read the git user
    identity, recreate the ``build`` directory next to this file, write
    ``SKILL.md`` with injected metadata, and obfuscate ``scripts``.
    """
    # All paths are resolved relative to the directory containing this file.
    root = os.path.dirname(os.path.abspath(__file__))
    skill_template = os.path.join(root, "SKILL.md")
    scripts_dir = os.path.join(root, "scripts")
    output_dir = os.path.join(root, "build")

    # The build timestamp doubles as the version number.
    version = generate_timestamp()

    user_name, user_email = get_git_user_info()
    author = "{} <{}>".format(user_name, user_email)

    clean_and_create_build_dir(output_dir)
    copy_skill_md(skill_template, output_dir, version, author)
    obfuscate_scripts_dir(scripts_dir, output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
15
build.sh
Executable file
15
build.sh
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# 混淆构建脚本
|
||||||
|
#
|
||||||
|
# 使用方式:
|
||||||
|
# ./build.sh
|
||||||
|
#
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
cd "$(dirname "$0")"
|
||||||
|
|
||||||
|
echo ">>> 构建"
|
||||||
|
uv run --with pyarmor python build.py
|
||||||
|
echo ">>> 完成"
|
||||||
126
docs/upgrade-deps-prompt.md
Normal file
126
docs/upgrade-deps-prompt.md
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
# 依赖版本优化流程提示词
|
||||||
|
|
||||||
|
## 任务概述
|
||||||
|
|
||||||
|
重新梳理 `scripts/config.py` 中 `DEPENDENCIES` 的版本号和 python 版本。
|
||||||
|
|
||||||
|
## 核心原则
|
||||||
|
|
||||||
|
1. **default 的 python 版本始终使用 None**,即默认 python 版本
|
||||||
|
2. **实在需要指定 python 版本时**,才在具体的系统依赖(如 Darwin-x86_64)中指定 python 版本,而不是改 default 中的 python 版本
|
||||||
|
3. **dependencies 中的依赖都需要指定版本**
|
||||||
|
- 以当前时间点的最新版本指定
|
||||||
|
- 如果最新版本无法满足,才在指定系统依赖中探索能运行的最新依赖版本号
|
||||||
|
|
||||||
|
## 推荐流程
|
||||||
|
|
||||||
|
### 阶段 1:规范梳理
|
||||||
|
|
||||||
|
1. 确定需要检查的依赖列表
|
||||||
|
2. 确定版本查询方法(如 PyPI JSON API)
|
||||||
|
3. 确定测试验证流程
|
||||||
|
|
||||||
|
### 阶段 2:版本探索(实现阶段)
|
||||||
|
|
||||||
|
1. **先移除当前平台的所有特定配置**,只保留 default
|
||||||
|
2. **default 配置使用最新版本作为标杆**
|
||||||
|
3. **逐个文件类型测试**
|
||||||
|
- 先测试 default 配置
|
||||||
|
- 若 default 失败,再添加特定平台配置并探索可运行的最新版本
|
||||||
|
4. **所有依赖(无论之前是否指定版本)都重新探索**
|
||||||
|
|
||||||
|
### 阶段 3:配置更新
|
||||||
|
|
||||||
|
1. 修改 `default.python = None`
|
||||||
|
2. 更新所有依赖到指定版本
|
||||||
|
3. 保留/调整特定平台的特殊配置
|
||||||
|
|
||||||
|
## 关键文件
|
||||||
|
|
||||||
|
- `scripts/config.py` - DEPENDENCIES 配置
|
||||||
|
- `run_tests.py` - 测试运行器(包含 TEST_FIXTURE_DEPENDENCIES)
|
||||||
|
- `openspec/changes/` - OpenSpec 变更目录
|
||||||
|
|
||||||
|
## 常用 PyPI 版本查询
|
||||||
|
|
||||||
|
使用 Python 查询 PyPI 最新版本:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def get_latest_version(package):
    """Return the latest version of *package* reported by the PyPI JSON API.

    Any failure is reported as the string ``error: <exception>`` instead of
    being raised.
    """
    try:
        endpoint = f'https://pypi.org/pypi/{package}/json'
        with urllib.request.urlopen(endpoint, timeout=15) as response:
            payload = json.load(response)
        return payload['info']['version']
    except Exception as exc:
        return f'error: {exc}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Windows 平台验证结果(2026-03-19)
|
||||||
|
|
||||||
|
### 验证状态:✓ 通过
|
||||||
|
|
||||||
|
**测试结果:**
|
||||||
|
- 所有文件类型的依赖安装测试通过
|
||||||
|
- 功能测试:295/303 测试通过
|
||||||
|
- 8 个已知失败(中文字符 PDF 解析、LibreOffice DOCX)
|
||||||
|
|
||||||
|
**验证通过的依赖:**
|
||||||
|
- PDF: docling 2.80.0, unstructured[pdf] 0.21.5, markitdown[pdf] 0.1.5, pypdf 6.9.0, markdownify 1.2.2
|
||||||
|
- DOCX: docling 2.80.0, unstructured[docx] 0.21.5, markitdown[docx] 0.1.5, pypandoc-binary 1.17, python-docx 1.2.0
|
||||||
|
- XLSX: docling 2.80.0, unstructured[xlsx] 0.21.5, markitdown[xlsx] 0.1.5, pandas 3.0.1, openpyxl 3.1.5
|
||||||
|
- PPTX: docling 2.80.0, unstructured[pptx] 0.21.5, markitdown[pptx] 0.1.5, python-pptx 1.0.2
|
||||||
|
- HTML: trafilatura 2.0.0, domscribe 0.1.3, markitdown 0.1.5, html2text 2025.4.15, beautifulsoup4 4.14.3
|
||||||
|
- XLS: unstructured[xlsx] 0.21.5, markitdown[xls] 0.1.5, pandas 3.0.1, xlrd 2.0.2, olefile 0.47
|
||||||
|
- PPT: docling 2.80.0, unstructured[pptx] 0.21.5, markitdown[pptx] 0.1.5, python-pptx 1.0.2, olefile 0.47
|
||||||
|
|
||||||
|
**已知问题:**
|
||||||
|
1. 中文字符在临时 PDF 生成中显示为 `<!-- image -->`(测试环境字体问题,不影响实际使用)
|
||||||
|
|
||||||
|
## 本次(2026-03-17)的经验总结
|
||||||
|
|
||||||
|
### Darwin-x86_64 平台的已知问题
|
||||||
|
|
||||||
|
1. **torch 无 Darwin-x86_64 wheel**(docling 2.80.0 依赖 torch)
|
||||||
|
- 解决:使用 docling 2.40.0 + docling-parse 4.0.0 + numpy<2
|
||||||
|
2. **onnxruntime 无 Darwin-x86_64 + Python 3.14 wheel**(markitdown 依赖)
|
||||||
|
- 解决:指定 python 3.12
|
||||||
|
3. **pyppeteer 2.0.0 与 selenium 4.41.0 的 urllib3 版本冲突**
|
||||||
|
- 解决:selenium 降级到 4.25.0
|
||||||
|
4. **pandas 3.0.1 与 fixtures 依赖 pandas<3.0.0 冲突**
|
||||||
|
- 解决:特定平台使用 pandas<3.0.0
|
||||||
|
|
||||||
|
### 当前依赖版本列表(截止 2026-03-19,Windows 验证通过)
|
||||||
|
|
||||||
|
| 依赖 | 版本 |
|
||||||
|
|------|------|
|
||||||
|
| docling | 2.80.0 (default) / 2.40.0 (Darwin-x86_64) |
|
||||||
|
| docling-parse | 5.5.0 (default) / 4.0.0 (Darwin-x86_64) |
|
||||||
|
| unstructured[...] | 0.21.5 |
|
||||||
|
| markitdown[...] | 0.1.5 |
|
||||||
|
| pypdf | 6.9.0 |
|
||||||
|
| markdownify | 1.2.2 |
|
||||||
|
| pypandoc-binary | 1.17 |
|
||||||
|
| python-docx | 1.2.0 |
|
||||||
|
| pandas | 3.0.1 (default) / <3.0.0 (Darwin-x86_64) |
|
||||||
|
| tabulate | 0.10.0 |
|
||||||
|
| openpyxl | 3.1.5 |
|
||||||
|
| python-pptx | 1.0.2 |
|
||||||
|
| trafilatura | 2.0.0 |
|
||||||
|
| domscribe | 0.1.3 |
|
||||||
|
| html2text | 2025.4.15 |
|
||||||
|
| beautifulsoup4 | 4.14.3 |
|
||||||
|
| httpx | 0.28.1 |
|
||||||
|
| chardet | 7.1.0 |
|
||||||
|
| pyppeteer | 2.0.0 |
|
||||||
|
| selenium | 4.25.0 (Darwin-x86_64) |
|
||||||
|
| xlrd | 2.0.2 |
|
||||||
|
| olefile | 0.47 |
|
||||||
|
| numpy | <2 (Darwin-x86_64) |
|
||||||
|
|
||||||
|
## 创建 OpenSpec 变更
|
||||||
|
|
||||||
|
使用 `/opsx:new` 或 `/opsx:ff` 创建变更,使用 spec-driven 工作流。
|
||||||
@@ -3,23 +3,22 @@ schema: spec-driven
|
|||||||
context: |
|
context: |
|
||||||
# 项目规范
|
# 项目规范
|
||||||
- 语言: 仅中文(交流/注释/文档/代码)
|
- 语言: 仅中文(交流/注释/文档/代码)
|
||||||
- Python: 始终用uv运行(脚本/临时命令uv run python -c); 禁用主机python/禁主机安装包
|
- Python: 当前项目始终用uv运行(脚本/临时命令uv run python -c); 禁用主机python/禁主机安装包
|
||||||
- 依赖: pyproject.toml声明,使用uv安装
|
|
||||||
- 主机环境: 禁止污染配置,需操作须请求用户
|
- 主机环境: 禁止污染配置,需操作须请求用户
|
||||||
- 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符
|
- 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符
|
||||||
- skill文档: skill/SKILL.md,每次迭代按需更新skill文档
|
- skill文档: SKILL.md,每次迭代按需更新skill文档
|
||||||
- 测试: 所有需求必须设计全面测试
|
- 测试: 所有需求必须设计全面测试,严禁跳过测试,无法进行的测试交用户决策
|
||||||
- 任务: 禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
|
- 任务: 除非用户直接要求,禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
|
||||||
- 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文
|
- 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文
|
||||||
- 项目阶段: 未上线,无用户,破坏性变更无需迁移说明
|
- 项目阶段: 未上线,无用户,破坏性变更无需迁移说明
|
||||||
- Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明
|
- Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明
|
||||||
|
- 提问: 对用户的提问优先使用提问工具而不是文字选项
|
||||||
# 项目概述
|
# 项目概述
|
||||||
- 目标:统一文档解析工具,将DOCX/XLSX/PPTX/PDF/HTML/URL 转换为 Markdown,面向AI skill使用
|
- 目标:统一文档解析工具,将各种格式的文档转换为 Markdown,面向AI skill使用
|
||||||
# 项目目录结构
|
# 项目目录结构
|
||||||
- scripts/: 核心代码目录
|
- scripts/: 核心代码目录
|
||||||
- skill/: skill文档目录
|
|
||||||
- tests/: 测试目录
|
- tests/: 测试目录
|
||||||
- openspec/: 规范文档目录
|
- openspec/: 规范文档目录
|
||||||
- temp/: 开发临时文件目录
|
- temp/: 开发临时文件目录
|
||||||
- pyproject.toml: 项目配置
|
|
||||||
- README.md: 项目开发文档
|
- README.md: 项目开发文档
|
||||||
|
- SKILL.md: skill文档
|
||||||
|
|||||||
129
openspec/specs/cli-advice/spec.md
Normal file
129
openspec/specs/cli-advice/spec.md
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
CLI 自启动机制,自动检测文件类型、平台和依赖,用正确的 uv 命令执行脚本。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 依赖配置结构
|
||||||
|
依赖配置必须同时包含 python 版本要求和依赖包列表,按文件类型和平台组织,供自启动逻辑内部使用。
|
||||||
|
|
||||||
|
#### Scenario: 配置结构包含 python 和 dependencies
|
||||||
|
- **WHEN** 访问 `config.DEPENDENCIES` 时
|
||||||
|
- **THEN** 每个文件类型配置包含多个平台配置
|
||||||
|
- **AND** 每个平台配置包含 `python` 字段(可为 None)和 `dependencies` 列表字段
|
||||||
|
|
||||||
|
#### Scenario: default 平台配置
|
||||||
|
- **WHEN** 平台无特殊配置时
|
||||||
|
- **THEN** 使用 `default` 配置
|
||||||
|
- **AND** `python` 为 `None` 表示不需要指定 `--python` 参数
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Requirement: 轻量文件类型检测
|
||||||
|
自启动必须复用 Reader 实例的 supports 方法识别文件类型,不打开文件。
|
||||||
|
|
||||||
|
#### Scenario: 复用 Reader 实例
|
||||||
|
- **WHEN** 检测文件类型时
|
||||||
|
- **THEN** 使用已实例化的 readers 列表
|
||||||
|
- **AND** 调用每个 reader 的 supports() 方法
|
||||||
|
- **AND** 根据第一个支持的 reader 类名识别文件类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 PDF 文件
|
||||||
|
- **WHEN** 输入路径以 `.pdf` 结尾(不区分大小写)
|
||||||
|
- **THEN** PdfReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 PDF 类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 DOCX 文件
|
||||||
|
- **WHEN** 输入路径以 `.docx` 结尾(不区分大小写)
|
||||||
|
- **THEN** DocxReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 DOCX 类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 XLSX 文件
|
||||||
|
- **WHEN** 输入路径以 `.xlsx` 结尾(不区分大小写)
|
||||||
|
- **THEN** XlsxReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 XLSX 类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 PPTX 文件
|
||||||
|
- **WHEN** 输入路径以 `.pptx` 结尾(不区分大小写)
|
||||||
|
- **THEN** PptxReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 PPTX 类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 HTML 文件
|
||||||
|
- **WHEN** 输入路径以 `.html` 或 `.htm` 结尾(不区分大小写)
|
||||||
|
- **THEN** HtmlReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 HTML 类型
|
||||||
|
|
||||||
|
#### Scenario: 检测 URL
|
||||||
|
- **WHEN** 输入路径以 `http://` 或 `https://` 开头
|
||||||
|
- **THEN** HtmlReader.supports() 返回 True
|
||||||
|
- **AND** 识别为 HTML 类型
|
||||||
|
|
||||||
|
#### Scenario: 不验证文件存在
|
||||||
|
- **WHEN** 输入路径指向不存在的文件
|
||||||
|
- **THEN** 仍根据 reader.supports() 识别类型,不报错
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Requirement: 平台检测
|
||||||
|
必须检测当前平台并选择适配的依赖配置。
|
||||||
|
|
||||||
|
#### Scenario: 检测平台格式
|
||||||
|
- **WHEN** 工具执行时
|
||||||
|
- **THEN** 返回格式为 `{system}-{machine}`,例如 `Darwin-arm64`、`Linux-x86_64`、`Windows-AMD64`
|
||||||
|
|
||||||
|
#### Scenario: macOS x86_64 PDF 特殊配置
|
||||||
|
- **WHEN** 平台为 `Darwin-x86_64` 且文件类型为 PDF
|
||||||
|
- **THEN** 使用包含 `--python 3.12` 和特定版本依赖的配置
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Requirement: 自启动检测
|
||||||
|
脚本必须自动检测文件类型、当前平台和 uv 可用性,如 uv 可用则用正确的 uv 命令启动 bootstrap.py。
|
||||||
|
|
||||||
|
#### Scenario: 检测文件类型
|
||||||
|
- **WHEN** 脚本启动时
|
||||||
|
- **THEN** 复用 Reader 的 supports() 方法识别文件类型
|
||||||
|
- **AND** 不打开文件,仅做轻量检测
|
||||||
|
|
||||||
|
#### Scenario: 检测平台
|
||||||
|
- **WHEN** 脚本启动时
|
||||||
|
- **THEN** 检测当前平台,格式为 `{system}-{machine}`
|
||||||
|
- **AND** 根据平台选择正确的依赖配置
|
||||||
|
|
||||||
|
#### Scenario: 检测 uv 是否可用
|
||||||
|
- **WHEN** 准备自启动前
|
||||||
|
- **THEN** 使用 `shutil.which("uv")` 检测 uv 是否在 PATH 中
|
||||||
|
- **AND** 如果 uv 不可用,降级为直接执行 bootstrap.py
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Requirement: 自启动执行
|
||||||
|
脚本必须使用 `subprocess.run()` 启动子进程,用正确的 uv 命令启动 bootstrap.py。
|
||||||
|
|
||||||
|
#### Scenario: 生成 uv 命令
|
||||||
|
- **WHEN** 脚本确定需要自启动
|
||||||
|
- **THEN** 根据文件类型和平台获取依赖配置
|
||||||
|
- **AND** 生成 `uv run [--python X.Y] --with <dep1> --with <dep2> ... scripts/bootstrap.py <input_path>` 命令
|
||||||
|
- **AND** 目标脚本是 bootstrap.py,不是 lyxy_document_reader.py
|
||||||
|
|
||||||
|
#### Scenario: 自启动设置环境变量
|
||||||
|
- **WHEN** 执行 `subprocess.run()` 自启动
|
||||||
|
- **THEN** 必须设置 `PYTHONPATH=.`
|
||||||
|
- **AND** 不需要设置 `LYXY_IN_UV`(自启动直接调用 bootstrap.py)
|
||||||
|
- **AND** 必须传递退出码给父进程
|
||||||
|
|
||||||
|
#### Scenario: 静默自启动
|
||||||
|
- **WHEN** 脚本执行自启动
|
||||||
|
- **THEN** 不输出任何额外提示信息
|
||||||
|
- **AND** 不干扰正常的 Markdown 输出
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Requirement: 降级执行
|
||||||
|
当 uv 不可用时,脚本必须降级为直接导入并执行 bootstrap.py。
|
||||||
|
|
||||||
|
#### Scenario: uv 不可用时降级
|
||||||
|
- **WHEN** uv 不在 PATH 中
|
||||||
|
- **THEN** 脚本直接导入 bootstrap 模块
|
||||||
|
- **AND** 调用 bootstrap.run_normal() 执行
|
||||||
|
- **AND** 如果缺少依赖,输出正常的 `ModuleNotFoundError`
|
||||||
61
openspec/specs/doc-reader/spec.md
Normal file
61
openspec/specs/doc-reader/spec.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
DOC 文档解析能力,支持解析 Microsoft Word 97-2003 旧格式文档。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: DOC 文档解析
|
||||||
|
系统 SHALL 支持解析 .doc 格式文档,使用 LibreOffice 解析器。
|
||||||
|
|
||||||
|
#### Scenario: 使用 LibreOffice 解析器
|
||||||
|
- **WHEN** 解析 DOC 文档
|
||||||
|
- **THEN** 系统使用 LibreOffice soffice 命令行进行解析
|
||||||
|
|
||||||
|
#### Scenario: 成功解析
|
||||||
|
- **WHEN** 解析器成功
|
||||||
|
- **THEN** 系统返回解析结果
|
||||||
|
|
||||||
|
#### Scenario: 解析器失败
|
||||||
|
- **WHEN** 解析器失败
|
||||||
|
- **THEN** 系统返回失败列表,并以非零状态码退出
|
||||||
|
|
||||||
|
### Requirement: LibreOffice 解析器
|
||||||
|
系统 SHALL 支持使用 LibreOffice soffice 命令行解析 DOC。
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析成功
|
||||||
|
- **WHEN** soffice 可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 内容
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 未安装
|
||||||
|
- **WHEN** soffice 未在 PATH 中
|
||||||
|
- **THEN** 系统返回失败信息
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换超时
|
||||||
|
- **WHEN** soffice 执行超过 60 秒
|
||||||
|
- **THEN** 系统返回超时错误
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换失败
|
||||||
|
- **WHEN** soffice 返回非零退出码
|
||||||
|
- **THEN** 系统返回失败信息
|
||||||
|
|
||||||
|
### Requirement: 解析器独立文件
|
||||||
|
系统 SHALL 将解析器实现为独立的单文件模块。
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析器在独立文件
|
||||||
|
- **WHEN** 使用 LibreOffice 解析器
|
||||||
|
- **THEN** 从 readers/doc/libreoffice.py 导入
|
||||||
|
|
||||||
|
### Requirement: DOC Reader 测试使用静态文件
|
||||||
|
DOC Reader 测试 MUST 使用 `tests/test_readers/fixtures/doc/` 下的静态文件。
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 simple.doc
|
||||||
|
- **WHEN** 测试 DOC Reader 基础解析能力
|
||||||
|
- **THEN** 使用 `simple.doc` 静态文件
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 with_headings.doc
|
||||||
|
- **WHEN** 测试 DOC Reader 标题解析
|
||||||
|
- **THEN** 使用 `with_headings.doc` 静态文件
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 with_table.doc
|
||||||
|
- **WHEN** 测试 DOC Reader 表格解析
|
||||||
|
- **THEN** 使用 `with_table.doc` 静态文件
|
||||||
@@ -9,7 +9,7 @@ DOCX 文档解析能力,支持多种解析方法。
|
|||||||
|
|
||||||
#### Scenario: 按优先级尝试解析器
|
#### Scenario: 按优先级尝试解析器
|
||||||
- **WHEN** 解析 DOCX 文档
|
- **WHEN** 解析 DOCX 文档
|
||||||
- **THEN** 系统按 docling → unstructured → markitdown → pypandoc-binary → python-docx → XML原生解析的顺序尝试
|
- **THEN** 系统按 docling → unstructured → pypandoc-binary → MarkItDown → LibreOffice → python-docx → XML原生解析的顺序尝试
|
||||||
|
|
||||||
#### Scenario: 成功解析
|
#### Scenario: 成功解析
|
||||||
- **WHEN** 任一解析器成功
|
- **WHEN** 任一解析器成功
|
||||||
@@ -85,6 +85,25 @@ DOCX 文档解析能力,支持多种解析方法。
|
|||||||
- **WHEN** XML 原生解析失败
|
- **WHEN** XML 原生解析失败
|
||||||
- **THEN** 系统返回失败信息
|
- **THEN** 系统返回失败信息
|
||||||
|
|
||||||
|
### Requirement: LibreOffice 解析器
|
||||||
|
系统 SHALL 支持使用 LibreOffice soffice 命令行解析 DOCX。
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析成功
|
||||||
|
- **WHEN** soffice 可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 内容
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 未安装
|
||||||
|
- **WHEN** soffice 未在 PATH 中
|
||||||
|
- **THEN** 系统尝试下一个解析器
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换超时
|
||||||
|
- **WHEN** soffice 执行超过 60 秒
|
||||||
|
- **THEN** 系统返回超时错误并尝试下一个解析器
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换失败
|
||||||
|
- **WHEN** soffice 返回非零退出码
|
||||||
|
- **THEN** 系统返回失败信息并尝试下一个解析器
|
||||||
|
|
||||||
### Requirement: 每个解析器独立文件
|
### Requirement: 每个解析器独立文件
|
||||||
系统 SHALL 将每个解析器实现为独立的单文件模块。
|
系统 SHALL 将每个解析器实现为独立的单文件模块。
|
||||||
|
|
||||||
@@ -111,3 +130,7 @@ DOCX 文档解析能力,支持多种解析方法。
|
|||||||
#### Scenario: XML 原生解析器在独立文件
|
#### Scenario: XML 原生解析器在独立文件
|
||||||
- **WHEN** 使用 XML 原生解析器
|
- **WHEN** 使用 XML 原生解析器
|
||||||
- **THEN** 从 readers/docx/native_xml.py 导入
|
- **THEN** 从 readers/docx/native_xml.py 导入
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析器在独立文件
|
||||||
|
- **WHEN** 使用 LibreOffice 解析器
|
||||||
|
- **THEN** 从 readers/docx/libreoffice.py 导入
|
||||||
|
|||||||
109
openspec/specs/multi-platform-dependencies/spec.md
Normal file
109
openspec/specs/multi-platform-dependencies/spec.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# 多平台依赖管理
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
为不同平台提供特定的依赖配置,解决平台特定的依赖兼容性问题(如 macOS x86_64 的 docling-parse 版本限制)。通过 `uv run --with` 方式按需加载依赖,在文档中提供平台特定的命令示例。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 平台检测文档
|
||||||
|
系统必须在 SKILL.md 中提供平台检测方法和平台特定的 `uv run --with` 命令示例。
|
||||||
|
|
||||||
|
#### Scenario: 平台检测命令
|
||||||
|
- **WHEN** 用户阅读 SKILL.md 中的多平台依赖安装指南
|
||||||
|
- **THEN** 系统必须提供以下平台的检测命令:
|
||||||
|
- macOS / Linux: `uname -m` 和 `uname -s`
|
||||||
|
- Windows: PowerShell 环境变量检测
|
||||||
|
- Python 跨平台检测: `import platform; print(f'{platform.system()}-{platform.machine()}')`
|
||||||
|
|
||||||
|
#### Scenario: macOS x86_64 特殊说明
|
||||||
|
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析依赖的安装说明
|
||||||
|
- **THEN** 系统必须明确说明以下特殊要求:
|
||||||
|
- 必须使用 Python 3.12
|
||||||
|
- `docling-parse` 5.x 无 x86_64 wheel,必须使用 4.0.0
|
||||||
|
- 提供完整的 `uv run --python 3.12 --with "docling==2.40.0" --with "docling-parse==4.0.0" --with "numpy<2" ...` 命令示例
|
||||||
|
- unstructured 在 Darwin-x86_64 平台不可用,已从配置中移除
|
||||||
|
|
||||||
|
#### Scenario: 每个平台的运行命令
|
||||||
|
- **WHEN** 用户阅读 SKILL.md
|
||||||
|
- **THEN** 系统必须为每个平台(Windows/macOS Intel/macOS ARM/Linux)和每种文档格式提供清晰的 `uv run --with` 命令示例
|
||||||
|
- **AND** 命令必须包含所有必需的依赖包
|
||||||
|
|
||||||
|
### Requirement: 依赖配置结构
|
||||||
|
config.py 中的 DEPENDENCIES 配置使用字典结构,保持简单直接以便于在不同平台进行细致调整。
|
||||||
|
|
||||||
|
#### Scenario: 配置数据格式不变
|
||||||
|
- **WHEN** 代码访问 config.DEPENDENCIES["pdf"]["default"]
|
||||||
|
- **THEN** 返回的数据结构保持不变
|
||||||
|
- **AND** 包含 "python" 和 "dependencies" 字段
|
||||||
|
|
||||||
|
#### Scenario: 所有文件类型都有 Darwin-x86_64 配置
|
||||||
|
- **WHEN** 查看 config.DEPENDENCIES
|
||||||
|
- **THEN** pdf/docx/xlsx/pptx/xls/ppt 都有 "Darwin-x86_64" 平台配置
|
||||||
|
- **AND** Darwin-x86_64 配置中不包含 unstructured 相关依赖
|
||||||
|
|
||||||
|
### Requirement: 依赖版本管理
|
||||||
|
所有依赖必须指定版本号;default 平台使用最新版本作为标杆;default 配置在当前平台测试失败时,在特定平台配置中探索可运行的最新版本;default 配置的 python 版本必须为 None(使用默认 python 版本),仅在特定平台配置中可指定 python 版本;当前版本截止时间为 2026-03-18。
|
||||||
|
|
||||||
|
#### Scenario: default 平台使用最新版本且 python 为 None
|
||||||
|
- **WHEN** 查看 config.DEPENDENCIES 中 default 配置
|
||||||
|
- **THEN** python 版本为 None
|
||||||
|
- **AND** 所有依赖都有明确的版本号
|
||||||
|
- **AND** 使用截止 2026-03-18 的最新版本
|
||||||
|
|
||||||
|
#### Scenario: 特定平台在 default 失败时探索可运行版本
|
||||||
|
- **WHEN** default 配置在当前平台测试失败
|
||||||
|
- **THEN** 在特定平台配置中探索可运行的最新版本
|
||||||
|
|
||||||
|
### Requirement: 平台检测文档
|
||||||
|
系统必须在 `SKILL.md` 中提供平台检测方法和平台特定的安装指南。
|
||||||
|
|
||||||
|
#### Scenario: 平台检测命令
|
||||||
|
- **WHEN** 用户阅读 `SKILL.md` 中的多平台依赖安装指南
|
||||||
|
- **THEN** 系统必须提供以下平台的检测命令:
|
||||||
|
- macOS / Linux: `uname -m` 和 `uname -s`
|
||||||
|
- Windows: PowerShell 环境变量检测
|
||||||
|
- Python 跨平台检测: `import platform; print(f'{platform.system()}-{platform.machine()}')`
|
||||||
|
|
||||||
|
#### Scenario: macOS x86_64 特殊说明
|
||||||
|
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析依赖的安装说明
|
||||||
|
- **THEN** 系统必须明确说明以下特殊要求:
|
||||||
|
- 必须使用 Python 3.12
|
||||||
|
- `docling-parse` 5.x 无 x86_64 wheel,必须使用 4.0.0
|
||||||
|
|
||||||
|
#### Scenario: 每个平台的安装命令
|
||||||
|
- **WHEN** 用户阅读 `SKILL.md`
|
||||||
|
- **THEN** 系统必须为每个平台(Windows/macOS Intel/macOS ARM/Linux)提供清晰的 `uv run` 命令示例
|
||||||
|
|
||||||
|
### Requirement: Lock 文件管理
|
||||||
|
系统必须移除 `uv.lock` 文件,每次 `uv run` 都是全新的依赖解析。
|
||||||
|
|
||||||
|
#### Scenario: 移除 uv.lock 文件
|
||||||
|
- **WHEN** 用户查看项目根目录
|
||||||
|
- **THEN** 系统必须不包含 uv.lock 文件
|
||||||
|
- **AND** 依赖版本由文档中的版本约束说明
|
||||||
|
|
||||||
|
#### Scenario: gitignore 配置(可选)
|
||||||
|
- **WHEN** 用户查看项目的 `.gitignore` 文件
|
||||||
|
- **THEN** 系统可以包含 `uv.lock` 条目以确保不会误提交(如果用户重新创建了 lock 文件)
|
||||||
|
|
||||||
|
### Requirement: 当前平台依赖验证
|
||||||
|
系统必须在当前平台上验证 `config.DEPENDENCIES` 的 default 配置是否可以正常工作。
|
||||||
|
|
||||||
|
#### Scenario: 验证 default 配置可用性
|
||||||
|
- **WHEN** 在当前平台运行测试
|
||||||
|
- **THEN** 必须验证 default 配置的所有依赖都可以正确安装
|
||||||
|
- **AND** 必须验证所有文档类型的解析功能正常工作
|
||||||
|
|
||||||
|
#### Scenario: 记录验证结果
|
||||||
|
- **WHEN** 完成当前平台的依赖验证
|
||||||
|
- **THEN** 必须在 `docs/upgrade-deps-prompt.md` 中记录验证结果
|
||||||
|
- **AND** 必须记录当前平台信息和测试通过日期
|
||||||
|
|
||||||
|
### Requirement: 依赖版本文档化
|
||||||
|
系统必须在 `docs/upgrade-deps-prompt.md` 中记录当前所有依赖的版本号和更新时间戳。
|
||||||
|
|
||||||
|
#### Scenario: 版本记录包含所有依赖
|
||||||
|
- **WHEN** 查看 `docs/upgrade-deps-prompt.md`
|
||||||
|
- **THEN** 文档必须包含所有文件类型(pdf/docx/xlsx/pptx/html/xls/ppt/doc)的所有依赖版本号
|
||||||
|
- **AND** 必须标注版本更新时间戳
|
||||||
58
openspec/specs/ppt-reader/spec.md
Normal file
58
openspec/specs/ppt-reader/spec.md
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
PPT 文档解析能力,支持解析 Microsoft PowerPoint 97-2003 旧格式文档。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: PPT 文档解析
|
||||||
|
系统 SHALL 支持解析 .ppt 格式文档,使用 LibreOffice 解析器。
|
||||||
|
|
||||||
|
#### Scenario: 使用 LibreOffice 解析器
|
||||||
|
- **WHEN** 解析 PPT 文档
|
||||||
|
- **THEN** 系统使用 LibreOffice soffice 将 PPT 转换为 PPTX
|
||||||
|
- **AND** 复用 PptxReader 解析转换后的 PPTX
|
||||||
|
|
||||||
|
#### Scenario: 成功解析
|
||||||
|
- **WHEN** 解析器成功
|
||||||
|
- **THEN** 系统返回解析结果
|
||||||
|
|
||||||
|
#### Scenario: 解析器失败
|
||||||
|
- **WHEN** 解析器失败
|
||||||
|
- **THEN** 系统返回失败列表,并以非零状态码退出
|
||||||
|
|
||||||
|
### Requirement: LibreOffice 解析器
|
||||||
|
系统 SHALL 支持使用 LibreOffice soffice 命令行解析 PPT。
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析成功
|
||||||
|
- **WHEN** soffice 可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 内容
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 未安装
|
||||||
|
- **WHEN** soffice 未在 PATH 中
|
||||||
|
- **THEN** 系统返回失败信息
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换超时
|
||||||
|
- **WHEN** soffice 执行超过 60 秒
|
||||||
|
- **THEN** 系统返回超时错误
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 转换失败
|
||||||
|
- **WHEN** soffice 返回非零退出码
|
||||||
|
- **THEN** 系统返回失败信息
|
||||||
|
|
||||||
|
#### Scenario: 临时文件自动清理
|
||||||
|
- **WHEN** 解析完成(无论成功或失败)
|
||||||
|
- **THEN** 转换过程中生成的临时 PPTX 文件被自动清理
|
||||||
|
|
||||||
|
### Requirement: 解析器独立文件
|
||||||
|
系统 SHALL 将解析器实现为独立的单文件模块。
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 解析器在独立文件
|
||||||
|
- **WHEN** 使用 LibreOffice 解析器
|
||||||
|
- **THEN** 从 readers/ppt/libreoffice.py 导入
|
||||||
|
|
||||||
|
### Requirement: PPT Reader 测试使用静态文件
|
||||||
|
PPT Reader 测试 MUST 使用 `tests/test_readers/fixtures/ppt/` 下的静态文件。
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 simple.ppt
|
||||||
|
- **WHEN** 测试 PPT Reader 基础解析能力
|
||||||
|
- **THEN** 使用 `simple.ppt` 静态文件
|
||||||
129
openspec/specs/reader-internal-utils/spec.md
Normal file
129
openspec/specs/reader-internal-utils/spec.md
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
提供 Reader 内部共享工具模块,包含解析器包装函数、格式化工具、ZIP 安全处理和 unstructured 库集成。此模块仅供 readers 包内部使用,不作为公共 API。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 解析器包装函数
|
||||||
|
系统 SHALL 提供统一的解析器包装函数,封装第三方库的调用细节。
|
||||||
|
|
||||||
|
#### Scenario: 使用 MarkItDown 解析
|
||||||
|
- **WHEN** 调用 `parse_via_markitdown(file_path)`
|
||||||
|
- **THEN** 系统使用 MarkItDown 库解析文件
|
||||||
|
- **AND** 成功时返回 `(markdown_content, None)`
|
||||||
|
- **AND** 失败时返回 `(None, error_message)`
|
||||||
|
|
||||||
|
#### Scenario: 使用 docling 解析
|
||||||
|
- **WHEN** 调用 `parse_via_docling(file_path)`
|
||||||
|
- **THEN** 系统使用 docling 库解析文件
|
||||||
|
- **AND** 成功时返回 `(markdown_content, None)`
|
||||||
|
- **AND** 失败时返回 `(None, error_message)`
|
||||||
|
|
||||||
|
#### Scenario: 库未安装时返回友好错误
|
||||||
|
- **WHEN** 调用解析器包装函数但对应库未安装
|
||||||
|
- **THEN** 系统返回 `(None, "<库名> 库未安装")`
|
||||||
|
|
||||||
|
### Requirement: Markdown 表格格式化
|
||||||
|
系统 SHALL 提供将二维列表格式化为 Markdown 表格的工具函数。
|
||||||
|
|
||||||
|
#### Scenario: 格式化标准表格
|
||||||
|
- **WHEN** 调用 `build_markdown_table(rows_data)` 且 rows_data 包含表头和数据行
|
||||||
|
- **THEN** 系统生成标准 Markdown 表格格式
|
||||||
|
- **AND** 第一行(表头)后生成分隔行(`| --- | --- |`)
|
||||||
|
|
||||||
|
#### Scenario: 空数据返回空字符串
|
||||||
|
- **WHEN** 调用 `build_markdown_table([])` 或 `build_markdown_table([[]])`
|
||||||
|
- **THEN** 系统返回空字符串
|
||||||
|
|
||||||
|
### Requirement: 列表堆栈处理
|
||||||
|
系统 SHALL 提供列表堆栈处理工具函数,用于处理嵌套列表的格式化输出。
|
||||||
|
|
||||||
|
#### Scenario: 刷新列表堆栈
|
||||||
|
- **WHEN** 调用 `flush_list_stack(list_stack, target)`
|
||||||
|
- **THEN** 系统将 list_stack 中所有非空项添加到 target 列表
|
||||||
|
- **AND** 每个项末尾添加换行符
|
||||||
|
- **AND** 清空 list_stack
|
||||||
|
|
||||||
|
#### Scenario: 跳过空项
|
||||||
|
- **WHEN** list_stack 中包含空字符串
|
||||||
|
- **THEN** 系统跳过空项,不添加到 target
|
||||||
|
|
||||||
|
### Requirement: ZIP 文件安全打开
|
||||||
|
系统 SHALL 提供安全的 ZIP 文件打开函数,防止路径遍历攻击。
|
||||||
|
|
||||||
|
#### Scenario: 打开合法文件
|
||||||
|
- **WHEN** 调用 `safe_open_zip(zip_file, "valid/file.txt")`
|
||||||
|
- **THEN** 系统返回对应的 ZipExtFile 对象
|
||||||
|
|
||||||
|
#### Scenario: 拒绝路径遍历攻击
|
||||||
|
- **WHEN** 路径的 Path.parts 中包含 `..`
|
||||||
|
- **THEN** 系统返回 None
|
||||||
|
|
||||||
|
#### Scenario: 拒绝绝对路径
|
||||||
|
- **WHEN** 路径为绝对路径
|
||||||
|
- **THEN** 系统返回 None
|
||||||
|
|
||||||
|
#### Scenario: 处理路径异常
|
||||||
|
- **WHEN** Path() 抛出 ValueError 或 OSError
|
||||||
|
- **THEN** 系统捕获异常并返回 None
|
||||||
|
|
||||||
|
### Requirement: unstructured 元素转换
|
||||||
|
系统 SHALL 提供将 unstructured 库解析的元素转换为 Markdown 的工具函数。
|
||||||
|
|
||||||
|
#### Scenario: 转换标准元素
|
||||||
|
- **WHEN** 调用 `convert_unstructured_to_markdown(elements, trust_titles=True)`
|
||||||
|
- **THEN** 系统跳过 Header、Footer、PageBreak、PageNumber 元素
|
||||||
|
- **AND** 跳过 RGB 颜色值和页码噪声
|
||||||
|
- **AND** Table 元素转换为 Markdown 表格
|
||||||
|
- **AND** Title 元素转换为 # 标题(根据 category_depth 确定级别)
|
||||||
|
- **AND** ListItem 元素转换为 - 列表项
|
||||||
|
- **AND** Image 元素转换为 Markdown 图片语法(`![alt](src)`)格式
|
||||||
|
|
||||||
|
#### Scenario: 库未安装时回退
|
||||||
|
- **WHEN** markdownify 或 unstructured 库未安装
|
||||||
|
- **THEN** 系统提取所有元素的 text 属性并用双换行连接
|
||||||
|
|
||||||
|
### Requirement: 噪声模式匹配
|
||||||
|
系统 SHALL 定义 unstructured 库的噪声匹配模式。
|
||||||
|
|
||||||
|
#### Scenario: 匹配 RGB 颜色值
|
||||||
|
- **WHEN** 文本匹配 `_UNSTRUCTURED_RGB_PATTERN`(如 "R:255 G:128 B:0")
|
||||||
|
- **THEN** 系统将其识别为噪声并过滤
|
||||||
|
|
||||||
|
#### Scenario: 匹配页码
|
||||||
|
- **WHEN** 文本匹配 `_UNSTRUCTURED_PAGE_NUMBER_PATTERN`(如 "— 3 —")
|
||||||
|
- **THEN** 系统将其识别为噪声并过滤
|
||||||
|
|
||||||
|
### Requirement: 通用 LibreOffice 格式转换
|
||||||
|
系统 SHALL 提供通用的 LibreOffice 格式转换函数,支持在不同格式间转换。
|
||||||
|
|
||||||
|
#### Scenario: 转换文件到指定格式
|
||||||
|
- **WHEN** 调用 `convert_via_libreoffice(input_path, target_format, output_dir)`
|
||||||
|
- **THEN** 系统使用 soffice --headless --convert-to 进行转换
|
||||||
|
- **AND** 输出文件写入 output_dir
|
||||||
|
- **AND** 成功时返回 (output_path, None)
|
||||||
|
- **AND** 失败时返回 (None, error_message)
|
||||||
|
|
||||||
|
#### Scenario: LibreOffice 未安装
|
||||||
|
- **WHEN** soffice 未在 PATH 中
|
||||||
|
- **THEN** 系统返回 (None, "LibreOffice 未安装")
|
||||||
|
|
||||||
|
#### Scenario: 转换超时
|
||||||
|
- **WHEN** soffice 执行超过 timeout 秒(默认 60 秒)
|
||||||
|
- **THEN** 系统返回 (None, "LibreOffice 转换超时")
|
||||||
|
|
||||||
|
#### Scenario: 转换失败
|
||||||
|
- **WHEN** soffice 返回非零退出码
|
||||||
|
- **THEN** 系统返回 (None, "LibreOffice 转换失败 (code: {code})")
|
||||||
|
|
||||||
|
#### Scenario: 输出文件未生成
|
||||||
|
- **WHEN** soffice 执行成功但未生成输出文件
|
||||||
|
- **THEN** 系统返回 (None, "LibreOffice 未生成输出文件")
|
||||||
|
|
||||||
|
#### Scenario: 可自定义输出后缀
|
||||||
|
- **WHEN** 提供 output_suffix 参数
|
||||||
|
- **THEN** 系统使用该后缀作为输出文件后缀,而不是 target_format
|
||||||
|
|
||||||
|
#### Scenario: 调用者管理输出目录生命周期
|
||||||
|
- **WHEN** convert_via_libreoffice 执行完成
|
||||||
|
- **THEN** 输出文件保留在 output_dir 中,由调用者负责清理
|
||||||
@@ -117,3 +117,60 @@ Reader MUST 正确处理包含特殊字符的内容。
|
|||||||
#### Scenario: 每个 PDF Reader 有独立测试
|
#### Scenario: 每个 PDF Reader 有独立测试
|
||||||
- **WHEN** 查看 test_readers/test_pdf/ 目录
|
- **WHEN** 查看 test_readers/test_pdf/ 目录
|
||||||
- **THEN** 存在 test_pypdf.py、test_markitdown.py、test_docling.py 等独立文件
|
- **THEN** 存在 test_pypdf.py、test_markitdown.py、test_docling.py 等独立文件
|
||||||
|
|
||||||
|
### Requirement: 旧格式文档测试覆盖
|
||||||
|
doc/xls/ppt 旧格式文档 MUST 有与新格式(docx/xlsx/pptx)一致的测试覆盖。
|
||||||
|
|
||||||
|
#### Scenario: doc 有一致性测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_doc/`
|
||||||
|
- **THEN** 存在 `test_consistency.py` 测试所有 DOC Readers 解析结果一致性
|
||||||
|
|
||||||
|
#### Scenario: xls 有一致性测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_xls/`
|
||||||
|
- **THEN** 存在 `test_consistency.py` 测试所有 XLS Readers 解析结果一致性
|
||||||
|
|
||||||
|
#### Scenario: ppt 有一致性测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_ppt/`
|
||||||
|
- **THEN** 存在 `test_consistency.py` 测试所有 PPT Readers 解析结果一致性
|
||||||
|
|
||||||
|
#### Scenario: doc 各解析器独立测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_doc/`
|
||||||
|
- **THEN** 每个解析器有独立测试文件(如 `test_markitdown_doc.py`、`test_pypandoc_doc.py`)
|
||||||
|
|
||||||
|
#### Scenario: xls 各解析器独立测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_xls/`
|
||||||
|
- **THEN** 每个解析器有独立测试文件(如 `test_markitdown_xls.py`、`test_unstructured_xls.py`、`test_pandas_xls.py`)
|
||||||
|
|
||||||
|
#### Scenario: ppt 各解析器独立测试
|
||||||
|
- **WHEN** 查看 `tests/test_readers/test_ppt/`
|
||||||
|
- **THEN** 每个解析器有独立测试文件(如 `test_markitdown_ppt.py`)
|
||||||
|
|
||||||
|
### Requirement: 旧格式测试使用静态文件
|
||||||
|
旧格式文档测试 MUST 使用静态测试文件,而非尝试自动化创建。
|
||||||
|
|
||||||
|
#### Scenario: doc 测试使用静态文件
|
||||||
|
- **WHEN** 运行 doc 相关测试
|
||||||
|
- **THEN** 测试从 `tests/test_readers/fixtures/doc/` 读取静态文件
|
||||||
|
|
||||||
|
#### Scenario: xls 测试使用静态文件
|
||||||
|
- **WHEN** 运行 xls 相关测试
|
||||||
|
- **THEN** 测试从 `tests/test_readers/fixtures/xls/` 读取静态文件
|
||||||
|
|
||||||
|
#### Scenario: ppt 测试使用静态文件
|
||||||
|
- **WHEN** 运行 ppt 相关测试
|
||||||
|
- **THEN** 测试从 `tests/test_readers/fixtures/ppt/` 读取静态文件
|
||||||
|
|
||||||
|
### Requirement: 静态文件缺失时优雅跳过
|
||||||
|
当静态测试文件不存在时,测试 MUST 优雅跳过,而非失败。
|
||||||
|
|
||||||
|
#### Scenario: doc 静态文件不存在时跳过
|
||||||
|
- **WHEN** `simple.doc` 不存在
|
||||||
|
- **THEN** 相关测试使用 `pytest.skip()` 跳过
|
||||||
|
|
||||||
|
#### Scenario: xls 静态文件不存在时跳过
|
||||||
|
- **WHEN** `simple.xls` 不存在
|
||||||
|
- **THEN** 相关测试使用 `pytest.skip()` 跳过
|
||||||
|
|
||||||
|
#### Scenario: ppt 静态文件不存在时跳过
|
||||||
|
- **WHEN** `simple.ppt` 不存在
|
||||||
|
- **THEN** 相关测试使用 `pytest.skip()` 跳过
|
||||||
|
|||||||
@@ -12,48 +12,37 @@ SKILL.md 文档必须遵循 Claude 官方 Skill 构建指南的最佳实践,
|
|||||||
- **THEN** AI 应能从 Purpose 和 When to Use 章节了解何时使用此 skill
|
- **THEN** AI 应能从 Purpose 和 When to Use 章节了解何时使用此 skill
|
||||||
|
|
||||||
### Requirement: YAML frontmatter 包含完整元数据
|
### Requirement: YAML frontmatter 包含完整元数据
|
||||||
YAML frontmatter 必须包含 name、description(带触发词)、license、metadata 和 compatibility 字段。
|
YAML frontmatter 必须包含 name、description(带触发词)、license 和 compatibility 字段。
|
||||||
|
|
||||||
#### Scenario: description 包含触发词
|
#### Scenario: description 包含触发词
|
||||||
- **WHEN** 查看 YAML frontmatter
|
- **WHEN** 查看 YAML frontmatter
|
||||||
- **THEN** description 应包含功能说明、触发条件和用户可能说的具体任务
|
- **THEN** description 应包含功能说明、触发条件和用户可能说的具体任务
|
||||||
|
|
||||||
#### Scenario: compatibility 说明依赖
|
#### Scenario: compatibility 说明依赖和 Python 版本
|
||||||
- **WHEN** 查看 YAML frontmatter
|
- **WHEN** 查看 YAML frontmatter
|
||||||
- **THEN** compatibility 应说明 Python 版本要求和两种执行路径的依赖情况
|
- **THEN** compatibility 应说明 Python 版本要求和脚本自启动特性
|
||||||
|
|
||||||
### Requirement: 双路径执行策略
|
### Requirement: 推荐用法
|
||||||
skill 文档必须说明两种执行路径:优先使用 lyxy-runner-python skill,回退到主机 Python 环境。
|
SKILL.md 必须说明直接运行 `python scripts/lyxy_document_reader.py` 即可,脚本会自动检测文件类型、当前平台,并用正确的 uv 命令执行。
|
||||||
|
|
||||||
#### Scenario: lyxy-runner-python 可用
|
#### Scenario: 直接运行脚本
|
||||||
- **WHEN** lyxy-runner-python skill 已安装
|
- **WHEN** AI 需要执行文档解析
|
||||||
- **THEN** 文档说明使用 lyxy-runner-python 自动管理依赖
|
- **THEN** 运行 `python scripts/lyxy_document_reader.py <文件路径或URL>`
|
||||||
|
|
||||||
#### Scenario: lyxy-runner-python 不可用
|
#### Scenario: 脚本自动检测
|
||||||
- **WHEN** lyxy-runner-python skill 不可用
|
- **WHEN** 运行脚本
|
||||||
- **THEN** 文档说明如何手动安装具体依赖包并使用主机 Python
|
- **THEN** 脚本自动检测文件类型、当前平台,并用正确的 uv 命令执行
|
||||||
|
|
||||||
### Requirement: 依赖说明使用具体包名
|
|
||||||
文档必须列出每个文档类型需要的具体 pip 包名,不能使用 lyxy-document[xxx] 形式(因为发布时没有 pyproject.toml)。
|
|
||||||
|
|
||||||
#### Scenario: 用户安装 DOCX 依赖
|
|
||||||
- **WHEN** 用户需要解析 DOCX 文档
|
|
||||||
- **THEN** 文档列出具体命令:pip install docling unstructured markitdown pypandoc-binary python-docx markdownify chardet
|
|
||||||
|
|
||||||
#### Scenario: 用户安装 PDF 依赖
|
|
||||||
- **WHEN** 用户需要解析 PDF 文档
|
|
||||||
- **THEN** 文档列出具体命令:pip install docling unstructured unstructured-paddleocr markitdown pypdf markdownify chardet
|
|
||||||
|
|
||||||
### Requirement: 文档包含关键章节
|
### Requirement: 文档包含关键章节
|
||||||
SKILL.md 必须包含 Purpose、When to Use、Quick Reference、Workflow 等章节,遵循渐进式披露原则。
|
SKILL.md 必须包含 Purpose、When to Use、Quick Reference、参数使用示例等章节,遵循渐进式披露原则。
|
||||||
|
|
||||||
#### Scenario: 快速查找用法
|
#### Scenario: 快速查找用法
|
||||||
- **WHEN** AI 需要了解如何使用此 skill
|
- **WHEN** AI 需要了解如何使用此 skill
|
||||||
- **THEN** Quick Reference 表格提供命令参数概览
|
- **THEN** Quick Reference 表格提供命令参数概览
|
||||||
|
|
||||||
#### Scenario: 了解执行流程
|
#### Scenario: 了解参数用法
|
||||||
- **WHEN** AI 需要理解解析流程
|
- **WHEN** AI 需要了解参数用法
|
||||||
- **THEN** Workflow 章节说明 4 步工作流程
|
- **THEN** 参数使用示例章节提供简洁的命令示例
|
||||||
|
|
||||||
### Requirement: 触发词覆盖多种表达方式
|
### Requirement: 触发词覆盖多种表达方式
|
||||||
description 和 When to Use 章节必须包含中文和英文的触发词,以及文件扩展名。
|
description 和 When to Use 章节必须包含中文和英文的触发词,以及文件扩展名。
|
||||||
@@ -71,7 +60,7 @@ description 和 When to Use 章节必须包含中文和英文的触发词,以
|
|||||||
|
|
||||||
#### Scenario: 依赖缺失错误
|
#### Scenario: 依赖缺失错误
|
||||||
- **WHEN** 出现 ModuleNotFoundError
|
- **WHEN** 出现 ModuleNotFoundError
|
||||||
- **THEN** 错误处理表格说明需要安装对应的依赖包
|
- **THEN** 错误处理表格说明脚本会自动检测并安装依赖
|
||||||
|
|
||||||
#### Scenario: 文件类型不支持
|
#### Scenario: 文件类型不支持
|
||||||
- **WHEN** 出现"不支持的文件类型"错误
|
- **WHEN** 出现"不支持的文件类型"错误
|
||||||
|
|||||||
109
openspec/specs/skill-packaging/spec.md
Normal file
109
openspec/specs/skill-packaging/spec.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
提供自动化的 skill 打包能力,将 SKILL.md 和 scripts/ 目录打包到 build/ 目录,便于 skill 分发。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: build.py 一键打包
|
||||||
|
系统 SHALL 提供 build.py 脚本,运行后完成 skill 的完整打包流程。
|
||||||
|
|
||||||
|
#### Scenario: 运行 build.py 成功
|
||||||
|
- **WHEN** 用户执行 `uv run --with pyarmor python build.py`
|
||||||
|
- **THEN** 脚本完成所有打包步骤并输出成功信息
|
||||||
|
|
||||||
|
### Requirement: 构建目录清理重建
|
||||||
|
系统 SHALL 在每次构建前删除整个 build 目录,然后重建空的 build 目录。
|
||||||
|
|
||||||
|
#### Scenario: 删除并重建 build 目录
|
||||||
|
- **WHEN** 构建开始
|
||||||
|
- **THEN** 脚本删除整个 build 目录(如有),然后创建新的空 build 目录
|
||||||
|
|
||||||
|
### Requirement: SKILL.md 动态生成
|
||||||
|
系统 SHALL 读取 SKILL.md 模板,动态注入 version 和 author 字段后写入 build/SKILL.md。
|
||||||
|
|
||||||
|
#### Scenario: SKILL.md 包含动态元数据
|
||||||
|
- **WHEN** 构建执行
|
||||||
|
- **THEN** build/SKILL.md 的 metadata 包含 version 和 author 字段
|
||||||
|
|
||||||
|
#### Scenario: version 是时间戳
|
||||||
|
- **WHEN** 构建在 2026年3月11日 14点30分22秒执行
|
||||||
|
- **THEN** build/SKILL.md 中 `metadata.version` 值为 "20260311_143022"
|
||||||
|
|
||||||
|
#### Scenario: author 来自 git 配置
|
||||||
|
- **WHEN** git config user.name 是 "Your Name",git config user.email 是 "your@email.com"
|
||||||
|
- **THEN** build/SKILL.md 中 `metadata.author` 值为 "Your Name <your@email.com>"
|
||||||
|
|
||||||
|
### Requirement: git 配置读取
|
||||||
|
系统 SHALL 从 git config 读取 user.name 和 user.email。
|
||||||
|
|
||||||
|
#### Scenario: git config 读取成功
|
||||||
|
- **WHEN** git config 已设置 user.name 和 user.email
|
||||||
|
- **THEN** 系统读取到正确的值
|
||||||
|
|
||||||
|
#### Scenario: git config 未设置
|
||||||
|
- **WHEN** git config user.name 或 user.email 未设置
|
||||||
|
- **THEN** 系统显示错误信息并退出
|
||||||
|
|
||||||
|
### Requirement: 时间戳版本号
|
||||||
|
系统 SHALL 生成 YYYYMMDD_HHMMSS 格式的时间戳作为构建版本标识。
|
||||||
|
|
||||||
|
#### Scenario: 时间戳格式正确
|
||||||
|
- **WHEN** 构建在 2025年3月9日 14点30分22秒执行
|
||||||
|
- **THEN** 生成的版本号为 20250309_143022
|
||||||
|
|
||||||
|
### Requirement: 输出构建信息
|
||||||
|
系统 SHALL 在构建完成后打印版本号和构建结果信息。
|
||||||
|
|
||||||
|
#### Scenario: 显示构建信息
|
||||||
|
- **WHEN** 构建成功完成
|
||||||
|
- **THEN** 控制台输出版本号和作者信息
|
||||||
|
|
||||||
|
### Requirement: 仅混淆构建
|
||||||
|
系统 SHALL 仅提供混淆构建模式,移除非混淆选项。
|
||||||
|
|
||||||
|
#### Scenario: build.py 始终混淆
|
||||||
|
- **WHEN** 用户执行 `uv run --with pyarmor python build.py`
|
||||||
|
- **THEN** 系统使用 PyArmor 混淆 scripts 目录代码
|
||||||
|
|
||||||
|
#### Scenario: 无 --obfuscate 参数
|
||||||
|
- **WHEN** 用户运行 build.py
|
||||||
|
- **THEN** 系统不需要 --obfuscate 参数,直接执行混淆构建
|
||||||
|
|
||||||
|
### Requirement: PyArmor 混淆执行
|
||||||
|
系统 SHALL 调用 PyArmor 工具对 scripts 目录进行混淆,然后将 pyarmor_runtime 目录移动到 scripts 内部。
|
||||||
|
|
||||||
|
#### Scenario: PyArmor 成功执行
|
||||||
|
- **WHEN** PyArmor 可用
|
||||||
|
- **THEN** 系统执行 pyarmor gen --recursive 命令
|
||||||
|
|
||||||
|
#### Scenario: 混淆后文件输出
|
||||||
|
- **WHEN** PyArmor 混淆完成
|
||||||
|
- **THEN** build/ 目录包含混淆后的 scripts 目录,且 pyarmor_runtime 子目录位于 scripts/ 内部
|
||||||
|
|
||||||
|
### Requirement: PyArmor 未安装友好提示
|
||||||
|
系统 SHALL 在 PyArmor 未安装时提供清晰的错误提示,引导用户正确使用 `uv run --with pyarmor`。
|
||||||
|
|
||||||
|
#### Scenario: PyArmor ImportError
|
||||||
|
- **WHEN** 未通过 --with pyarmor 加载
|
||||||
|
- **THEN** 系统显示友好错误信息,提示正确命令
|
||||||
|
|
||||||
|
### Requirement: SKILL.md 保持明文
|
||||||
|
系统 SHALL 在混淆模式下仍然将 SKILL.md 作为明文文件复制,不进行混淆。
|
||||||
|
|
||||||
|
#### Scenario: SKILL.md 保持明文
|
||||||
|
- **WHEN** 启用混淆执行构建
|
||||||
|
- **THEN** build/SKILL.md 文件为明文,内容包含动态注入的元数据
|
||||||
|
|
||||||
|
### Requirement: 混淆错误处理
|
||||||
|
系统 SHALL 在 PyArmor 混淆失败时捕获错误并显示详细信息。
|
||||||
|
|
||||||
|
#### Scenario: PyArmor 命令失败
|
||||||
|
- **WHEN** pyarmor 命令执行返回非零退出码
|
||||||
|
- **THEN** 系统显示退出码、标准输出和错误输出信息
|
||||||
|
|
||||||
|
### Requirement: 一键发布脚本
|
||||||
|
系统 SHALL 提供 publish.sh 脚本,一键执行混淆构建并发布。
|
||||||
|
|
||||||
|
#### Scenario: publish.sh 执行成功
|
||||||
|
- **WHEN** 用户执行 `./publish.sh`
|
||||||
|
- **THEN** 系统依次执行混淆构建和发布
|
||||||
82
openspec/specs/skill-publishing/spec.md
Normal file
82
openspec/specs/skill-publishing/spec.md
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
提供 skill 发布到目标 GitHub 仓库的能力,自动化将 build/ 目录内容同步到 skills 仓库。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: publish.py 一键发布
|
||||||
|
系统 SHALL 提供 publish.py 脚本,运行后将 build/ 目录内容发布到目标仓库。
|
||||||
|
|
||||||
|
#### Scenario: 运行 publish.py 成功
|
||||||
|
- **WHEN** 用户执行 `uv run python publish.py`
|
||||||
|
- **THEN** 脚本完成所有发布步骤并输出成功信息
|
||||||
|
|
||||||
|
### Requirement: 使用临时目录 clone
|
||||||
|
系统 SHALL 在系统临时目录创建临时文件夹,用于 clone 目标仓库。
|
||||||
|
|
||||||
|
#### Scenario: 临时目录自动清理
|
||||||
|
- **WHEN** 发布完成或失败
|
||||||
|
- **THEN** 临时目录被自动清理
|
||||||
|
|
||||||
|
### Requirement: shallow clone
|
||||||
|
系统 SHALL 使用 `--depth 1` 参数 clone 目标仓库,加快 clone 速度。
|
||||||
|
|
||||||
|
#### Scenario: clone 参数正确
|
||||||
|
- **WHEN** 执行 git clone
|
||||||
|
- **THEN** 命令包含 `--depth 1` 参数
|
||||||
|
|
||||||
|
### Requirement: 目标仓库配置
|
||||||
|
系统 SHALL 硬编码目标仓库 URL 为 `https://github.com/lanyuanxiaoyao/skills.git`。
|
||||||
|
|
||||||
|
#### Scenario: 目标仓库正确
|
||||||
|
- **WHEN** publish.py 执行 clone
|
||||||
|
- **THEN** clone 的仓库地址是 `https://github.com/lanyuanxiaoyao/skills.git`
|
||||||
|
|
||||||
|
### Requirement: 目标路径配置
|
||||||
|
系统 SHALL 将内容发布到目标仓库的 `skills/lyxy-document-reader/` 路径。
|
||||||
|
|
||||||
|
#### Scenario: 目标路径正确
|
||||||
|
- **WHEN** 文件同步完成
|
||||||
|
- **THEN** 文件位于 `skills/lyxy-document-reader/` 目录下
|
||||||
|
|
||||||
|
### Requirement: 清空目标路径
|
||||||
|
系统 SHALL 在复制前清空 `skills/lyxy-document-reader/` 目录内容。
|
||||||
|
|
||||||
|
#### Scenario: 旧文件被清理
|
||||||
|
- **WHEN** 开始同步文件
|
||||||
|
- **THEN** 目标目录下的旧文件被删除
|
||||||
|
|
||||||
|
### Requirement: 从 SKILL.md 读取版本号
|
||||||
|
系统 SHALL 解析 build/SKILL.md 的 YAML frontmatter,获取 `metadata.version` 字段作为版本号。
|
||||||
|
|
||||||
|
#### Scenario: 版本号读取成功
|
||||||
|
- **WHEN** build/SKILL.md 包含 `metadata.version: "20260311_143022"`
|
||||||
|
- **THEN** publish.py 读取到版本号 "20260311_143022"
|
||||||
|
|
||||||
|
### Requirement: git 提交信息
|
||||||
|
系统 SHALL 使用包含版本号的 commit message,格式为 `publish: lyxy-document-reader <version>`。
|
||||||
|
|
||||||
|
#### Scenario: commit message 正确
|
||||||
|
- **WHEN** 版本号是 20260311_143022
|
||||||
|
- **THEN** commit message 是 `publish: lyxy-document-reader 20260311_143022`
|
||||||
|
|
||||||
|
### Requirement: git 提交并推送
|
||||||
|
系统 SHALL 执行 git add、git commit 和 git push 操作。
|
||||||
|
|
||||||
|
#### Scenario: git 操作成功
|
||||||
|
- **WHEN** 文件同步完成
|
||||||
|
- **THEN** 系统执行 git add .、git commit 和 git push
|
||||||
|
|
||||||
|
### Requirement: build 目录存在检查
|
||||||
|
系统 SHALL 在开始前检查 build/ 目录是否存在,不存在则提示错误。
|
||||||
|
|
||||||
|
#### Scenario: build 目录不存在
|
||||||
|
- **WHEN** build/ 目录不存在
|
||||||
|
- **THEN** 脚本显示错误信息并退出
|
||||||
|
|
||||||
|
### Requirement: SKILL.md 存在检查
|
||||||
|
系统 SHALL 检查 build/SKILL.md 是否存在,不存在则提示错误。
|
||||||
|
|
||||||
|
#### Scenario: build/SKILL.md 不存在
|
||||||
|
- **WHEN** build/SKILL.md 不存在
|
||||||
|
- **THEN** 脚本显示错误信息并退出
|
||||||
@@ -6,6 +6,23 @@
|
|||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 测试运行器包含 fixtures 依赖
|
||||||
|
run_tests.py 必须定义 TEST_FIXTURE_DEPENDENCIES 常量,包含创建临时测试文件所需的所有依赖。
|
||||||
|
|
||||||
|
#### Scenario: TEST_FIXTURE_DEPENDENCIES 定义存在
|
||||||
|
- **WHEN** 查看 run_tests.py
|
||||||
|
- **THEN** 存在 TEST_FIXTURE_DEPENDENCIES 常量
|
||||||
|
- **AND** 包含 python-docx(用于创建临时 DOCX)
|
||||||
|
- **AND** 包含 reportlab(用于创建临时 PDF)
|
||||||
|
- **AND** 包含 pandas(用于创建临时 XLSX)
|
||||||
|
- **AND** 包含 openpyxl(pandas 写 XLSX 需要)
|
||||||
|
- **AND** 包含 python-pptx(用于创建临时 PPTX)
|
||||||
|
|
||||||
|
#### Scenario: fixtures 依赖与文件类型依赖合并
|
||||||
|
- **WHEN** 运行任何类型的测试
|
||||||
|
- **THEN** TEST_FIXTURE_DEPENDENCIES 中的依赖自动合并到 uv run --with 参数中
|
||||||
|
- **AND** 去重处理,避免重复添加
|
||||||
|
|
||||||
### Requirement: 临时文件自动清理
|
### Requirement: 临时文件自动清理
|
||||||
测试使用的临时文件 MUST 在测试完成后自动清理,使用 pytest 的 tmp_path fixture。
|
测试使用的临时文件 MUST 在测试完成后自动清理,使用 pytest 的 tmp_path fixture。
|
||||||
|
|
||||||
@@ -59,6 +76,18 @@ tests/test_readers/conftest.py MUST 提供 Reader 测试专用的 fixtures。
|
|||||||
- **WHEN** 测试需要临时 XLSX 文件
|
- **WHEN** 测试需要临时 XLSX 文件
|
||||||
- **THEN** 可以使用 `temp_xlsx` fixture 创建临时 XLSX 文件
|
- **THEN** 可以使用 `temp_xlsx` fixture 创建临时 XLSX 文件
|
||||||
|
|
||||||
|
#### Scenario: 提供 doc 静态文件 fixtures
|
||||||
|
- **WHEN** 测试需要 doc 静态测试文件
|
||||||
|
- **THEN** 可以使用 `simple_doc_path`、`with_headings_doc_path`、`with_table_doc_path`
|
||||||
|
|
||||||
|
#### Scenario: 提供 xls 静态文件 fixtures
|
||||||
|
- **WHEN** 测试需要 xls 静态测试文件
|
||||||
|
- **THEN** 可以使用 `simple_xls_path`、`multiple_sheets_xls_path`、`with_formulas_xls_path`
|
||||||
|
|
||||||
|
#### Scenario: 提供 ppt 静态文件 fixtures
|
||||||
|
- **WHEN** 测试需要 ppt 静态测试文件
|
||||||
|
- **THEN** 可以使用 `simple_ppt_path`、`multiple_slides_ppt_path`、`with_images_ppt_path`
|
||||||
|
|
||||||
### Requirement: CLI 专用 fixtures
|
### Requirement: CLI 专用 fixtures
|
||||||
tests/test_cli/conftest.py MUST 提供 CLI 测试专用的 fixtures。
|
tests/test_cli/conftest.py MUST 提供 CLI 测试专用的 fixtures。
|
||||||
|
|
||||||
@@ -106,3 +135,54 @@ temp_html fixture MUST 支持创建包含各种元素的 HTML 文件。
|
|||||||
#### Scenario: 创建包含标题和段落的 HTML
|
#### Scenario: 创建包含标题和段落的 HTML
|
||||||
- **WHEN** 调用 `temp_html(content="<h1>标题</h1><p>段落</p>")`
|
- **WHEN** 调用 `temp_html(content="<h1>标题</h1><p>段落</p>")`
|
||||||
- **THEN** 创建包含指定内容的 HTML 文件
|
- **THEN** 创建包含指定内容的 HTML 文件
|
||||||
|
|
||||||
|
### Requirement: 静态测试文件目录结构
|
||||||
|
项目 MUST 在 `tests/test_readers/fixtures/` 下按格式类型组织静态测试文件。
|
||||||
|
|
||||||
|
#### Scenario: doc 静态文件目录
|
||||||
|
- **WHEN** 查看 `tests/test_readers/fixtures/doc/`
|
||||||
|
- **THEN** 目录存在且包含 .doc 静态测试文件
|
||||||
|
|
||||||
|
#### Scenario: xls 静态文件目录
|
||||||
|
- **WHEN** 查看 `tests/test_readers/fixtures/xls/`
|
||||||
|
- **THEN** 目录存在且包含 .xls 静态测试文件
|
||||||
|
|
||||||
|
#### Scenario: ppt 静态文件目录
|
||||||
|
- **WHEN** 查看 `tests/test_readers/fixtures/ppt/`
|
||||||
|
- **THEN** 目录存在且包含 .ppt 静态测试文件
|
||||||
|
|
||||||
|
### Requirement: fixtures 目录所有文件纳入 Git LFS
|
||||||
|
`tests/test_readers/fixtures/` 目录下的 ALL 文件 MUST 纳入 Git LFS 管理。
|
||||||
|
|
||||||
|
#### Scenario: .gitattributes 配置正确
|
||||||
|
- **WHEN** 查看 `.gitattributes`
|
||||||
|
- **THEN** 包含 `tests/test_readers/fixtures/**/*` 的 LFS 配置,匹配该目录下所有文件
|
||||||
|
|
||||||
|
### Requirement: fixtures 目录仅存放静态文件
|
||||||
|
`tests/test_readers/fixtures/` 目录 MUST 仅用于存放预先准备的静态测试文件,禁止在测试中向该目录动态生成临时文件。
|
||||||
|
|
||||||
|
#### Scenario: 不向 fixtures 目录写入临时文件
|
||||||
|
- **WHEN** 测试运行时
|
||||||
|
- **THEN** 不会在 `tests/test_readers/fixtures/` 下创建或修改文件
|
||||||
|
- **AND** 临时文件使用 pytest 的 tmp_path 在其他位置创建
|
||||||
|
|
||||||
|
### Requirement: 静态测试文件 Fixture
|
||||||
|
`tests/test_readers/conftest.py` MUST 提供访问静态测试文件的 fixtures。
|
||||||
|
|
||||||
|
#### Scenario: 提供目录路径 fixture
|
||||||
|
- **WHEN** 测试需要访问静态文件目录
|
||||||
|
- **THEN** 可以使用 `doc_fixture_path`、`xls_fixture_path`、`ppt_fixture_path` 获取对应目录路径
|
||||||
|
|
||||||
|
#### Scenario: 提供单个文件 fixture
|
||||||
|
- **WHEN** 测试需要访问特定静态文件
|
||||||
|
- **THEN** 可以使用 `simple_doc_path`、`with_headings_doc_path` 等便捷 fixture
|
||||||
|
- **AND** 文件不存在时自动 pytest.skip
|
||||||
|
|
||||||
|
### Requirement: fixtures 使用规范写入开发文档
|
||||||
|
fixtures 目录的使用规范 MUST 写入 README.md 开发文档。
|
||||||
|
|
||||||
|
#### Scenario: README 包含 fixtures 规范
|
||||||
|
- **WHEN** 查看 README.md
|
||||||
|
- **THEN** 包含 fixtures 目录的用途说明
|
||||||
|
- **AND** 包含静态文件与临时文件的区别说明
|
||||||
|
- **AND** 包含 Git LFS 配置说明
|
||||||
|
|||||||
69
openspec/specs/test-runner/spec.md
Normal file
69
openspec/specs/test-runner/spec.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# Test Runner Specification
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
定义自动化测试运行器的功能规范,包括测试类型选择、依赖自动加载、pytest 参数透传等。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 测试运行器支持指定测试类型
|
||||||
|
测试运行器 SHALL 支持通过命令行参数指定测试类型,自动加载对应依赖并运行 pytest。
|
||||||
|
|
||||||
|
#### Scenario: 运行 PDF 测试
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py pdf`
|
||||||
|
- **THEN** 自动加载 config.DEPENDENCIES["pdf"] 中的依赖
|
||||||
|
- **AND** 自动加载测试 fixtures 所需的依赖
|
||||||
|
- **AND** 运行 tests/test_readers/test_pdf/ 目录下的测试
|
||||||
|
|
||||||
|
#### Scenario: 运行 DOCX 测试
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py docx`
|
||||||
|
- **THEN** 自动加载 config.DEPENDENCIES["docx"] 中的依赖
|
||||||
|
- **AND** 自动加载测试 fixtures 所需的依赖
|
||||||
|
- **AND** 运行 tests/test_readers/test_docx/ 目录下的测试
|
||||||
|
|
||||||
|
#### Scenario: 运行 CLI 测试(加载所有类型的依赖)
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py cli`
|
||||||
|
- **THEN** 加载 pytest 依赖
|
||||||
|
- **AND** 自动加载测试 fixtures 所需的依赖
|
||||||
|
- **AND** 加载 config.DEPENDENCIES 中所有类型的依赖(去重)
|
||||||
|
- **AND** 运行 tests/test_cli/ 目录下的测试
|
||||||
|
|
||||||
|
#### Scenario: 运行所有测试
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py all`
|
||||||
|
- **THEN** 加载 config.DEPENDENCIES 中所有类型的依赖(去重)
|
||||||
|
- **AND** 自动加载测试 fixtures 所需的依赖
|
||||||
|
- **AND** 运行 tests/ 目录下的所有测试
|
||||||
|
|
||||||
|
### Requirement: 测试运行器支持透传 pytest 参数
|
||||||
|
测试运行器 SHALL 支持将额外的命令行参数透传给 pytest。
|
||||||
|
|
||||||
|
#### Scenario: 传递 -v 参数
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py pdf -v`
|
||||||
|
- **THEN** pytest 以 verbose 模式运行
|
||||||
|
|
||||||
|
#### Scenario: 传递 --cov 参数
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py pdf --cov=scripts`
|
||||||
|
- **THEN** pytest 生成测试覆盖率报告
|
||||||
|
|
||||||
|
#### Scenario: 运行特定测试文件
|
||||||
|
- **WHEN** 用户执行 `python run_tests.py pdf tests/test_readers/test_pdf/test_docling_pdf.py`
|
||||||
|
- **THEN** 仅运行指定的测试文件
|
||||||
|
|
||||||
|
### Requirement: 测试运行器支持平台特定配置
|
||||||
|
测试运行器 SHALL 根据当前平台自动选择对应的依赖配置(如 Darwin-x86_64)。
|
||||||
|
|
||||||
|
#### Scenario: 在 Darwin-x86_64 平台运行 PDF 测试
|
||||||
|
- **WHEN** 用户在 Darwin-x86_64 平台执行 `python run_tests.py pdf`
|
||||||
|
- **THEN** 使用 config.DEPENDENCIES["pdf"]["Darwin-x86_64"] 配置(如果存在)
|
||||||
|
- **AND** 使用 python 3.12(如配置中指定)
|
||||||
|
|
||||||
|
### Requirement: advice_generator 包含完整 Reader 映射
|
||||||
|
advice_generator.py 中的 _READER_KEY_MAP SHALL 包含所有 Reader 类的映射,包括 DocReader 和 PptReader。
|
||||||
|
|
||||||
|
#### Scenario: DocReader 映射存在
|
||||||
|
- **WHEN** 查询 _READER_KEY_MAP[DocReader]
|
||||||
|
- **THEN** 返回 "doc"
|
||||||
|
|
||||||
|
#### Scenario: PptReader 映射存在
|
||||||
|
- **WHEN** 查询 _READER_KEY_MAP[PptReader]
|
||||||
|
- **THEN** 返回 "ppt"
|
||||||
103
openspec/specs/uv-with-dependency-management/spec.md
Normal file
103
openspec/specs/uv-with-dependency-management/spec.md
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
# UV --with 依赖管理
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
基于文档的依赖管理方式,使用 `uv run --with` 按需加载依赖。移除 pyproject.toml 和 uv.lock,通过 SKILL.md 和 README.md 提供完整的依赖说明和命令示例。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: 文档驱动的依赖声明
|
||||||
|
系统必须在 SKILL.md 和 README.md 中明确说明每种文档格式和平台所需的依赖包。README.md 面向开发者,提供开发和测试的命令示例;SKILL.md 面向 AI,强调 --advice 优先。
|
||||||
|
|
||||||
|
#### Scenario: SKILL.md 强调 --advice 而非完整命令
|
||||||
|
- **WHEN** AI 或用户阅读 SKILL.md
|
||||||
|
- **THEN** 文档引导使用 --advice 获取准确命令,不提供完整的 `uv run --with` 命令块
|
||||||
|
- **AND** 仅保留简洁的参数使用示例
|
||||||
|
|
||||||
|
#### Scenario: README.md 包含开发依赖速查表
|
||||||
|
- **WHEN** 开发者阅读 README.md
|
||||||
|
- **THEN** 文档必须提供测试每种格式的 `uv run --with` 命令示例
|
||||||
|
- **AND** 必须包含特殊平台的版本约束说明(如 macOS Intel)
|
||||||
|
|
||||||
|
### Requirement: 按需加载依赖
|
||||||
|
系统必须使用 `uv run --with` 方式按需加载依赖,无需预先安装 extras 组合。
|
||||||
|
|
||||||
|
#### Scenario: 运行 PDF 解析
|
||||||
|
- **WHEN** 用户执行 `uv run --with docling --with pypdf --with chardet scripts/lyxy_document_reader.py file.pdf`
|
||||||
|
- **THEN** 系统必须自动安装这些依赖(如果尚未安装)
|
||||||
|
- **AND** 必须成功执行脚本
|
||||||
|
|
||||||
|
#### Scenario: 测试 DOCX reader
|
||||||
|
- **WHEN** 开发者执行 `uv run --with docling --with python-docx ... pytest tests/test_readers/test_docx/`
|
||||||
|
- **THEN** 系统必须只安装指定的依赖
|
||||||
|
- **AND** 必须成功运行测试
|
||||||
|
|
||||||
|
### Requirement: 平台特定版本约束
|
||||||
|
系统必须在文档和命令中明确说明特殊平台的版本约束。
|
||||||
|
|
||||||
|
#### Scenario: macOS Intel 的 PDF 解析
|
||||||
|
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析说明
|
||||||
|
- **THEN** 文档必须明确说明需要 Python 3.12
|
||||||
|
- **AND** 命令必须包含版本约束:`--with "docling==2.40.0" --with "docling-parse==4.0.0" --with "numpy<2"`
|
||||||
|
- **AND** 必须说明原因:docling-parse 5.x 无 x86_64 wheel
|
||||||
|
|
||||||
|
#### Scenario: 其他平台使用最新版本
|
||||||
|
- **WHEN** 用户在 macOS ARM 或 Linux 平台
|
||||||
|
- **THEN** 命令可以省略版本号,使用最新兼容版本
|
||||||
|
- **AND** 文档必须说明这是可行的
|
||||||
|
|
||||||
|
### Requirement: 移除 pyproject.toml
|
||||||
|
系统必须移除 pyproject.toml 文件,不再使用 extras 声明依赖。
|
||||||
|
|
||||||
|
#### Scenario: 项目根目录不包含 pyproject.toml
|
||||||
|
- **WHEN** 用户查看项目根目录
|
||||||
|
- **THEN** 系统必须不包含 pyproject.toml 文件
|
||||||
|
|
||||||
|
#### Scenario: 依赖说明不在 pyproject.toml
|
||||||
|
- **WHEN** 用户尝试查找依赖声明
|
||||||
|
- **THEN** 系统必须引导用户查阅 SKILL.md 或 README.md
|
||||||
|
|
||||||
|
### Requirement: 移除 uv.lock
|
||||||
|
系统必须移除 uv.lock 文件,每次 `uv run` 都是全新的依赖解析。
|
||||||
|
|
||||||
|
#### Scenario: 项目不包含 uv.lock
|
||||||
|
- **WHEN** 用户查看项目根目录
|
||||||
|
- **THEN** 系统必须不包含 uv.lock 文件
|
||||||
|
|
||||||
|
#### Scenario: 依赖版本由文档说明
|
||||||
|
- **WHEN** 用户需要了解依赖版本约束
|
||||||
|
- **THEN** 系统必须在 SKILL.md 或 README.md 中说明
|
||||||
|
- **AND** 不依赖 uv.lock 锁定版本
|
||||||
|
|
||||||
|
### Requirement: 核心 chardet 依赖
|
||||||
|
系统必须在所有 `uv run --with` 命令中包含 chardet 依赖。
|
||||||
|
|
||||||
|
#### Scenario: 所有格式都包含 chardet
|
||||||
|
- **WHEN** 用户查阅任何格式的依赖命令
|
||||||
|
- **THEN** 命令必须包含 `--with chardet`
|
||||||
|
|
||||||
|
### Requirement: 当前平台命令验证
|
||||||
|
系统必须验证当前平台的 `uv run --with` 命令可以正确执行。
|
||||||
|
|
||||||
|
#### Scenario: 验证 default 平台命令
|
||||||
|
- **WHEN** 在当前平台执行 `uv run --with` 命令
|
||||||
|
- **THEN** 必须可以成功安装所有依赖
|
||||||
|
- **AND** 必须可以成功运行文档解析脚本
|
||||||
|
|
||||||
|
#### Scenario: 记录当前平台命令
|
||||||
|
- **WHEN** 更新 SKILL.md 或 README.md
|
||||||
|
- **THEN** 必须包含当前平台的命令示例
|
||||||
|
- **AND** 命令中的依赖版本必须与 `config.DEPENDENCIES` 一致
|
||||||
|
|
||||||
|
### Requirement: 版本一致性
|
||||||
|
SKILL.md 和 README.md 中的依赖版本必须与 `config.DEPENDENCIES` 中指定的版本一致。
|
||||||
|
|
||||||
|
#### Scenario: 文档中的版本与配置一致
|
||||||
|
- **WHEN** 查看 SKILL.md 或 README.md 中的 `uv run --with` 命令示例
|
||||||
|
- **THEN** 命令中指定的依赖版本必须与 `config.DEPENDENCIES` 中 default 配置的版本一致
|
||||||
|
- **AND** 如果配置中指定了特定版本,文档中必须使用相同版本
|
||||||
|
|
||||||
|
#### Scenario: 更新依赖时同步更新文档
|
||||||
|
- **WHEN** 更新 `config.DEPENDENCIES` 中的依赖版本
|
||||||
|
- **THEN** 必须同步更新 SKILL.md 和 README.md 中的相关命令示例
|
||||||
|
- **AND** 必须更新 `docs/upgrade-deps-prompt.md` 中的版本记录
|
||||||
83
openspec/specs/xls-reader/spec.md
Normal file
83
openspec/specs/xls-reader/spec.md
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
## Purpose
|
||||||
|
|
||||||
|
XLS 文档解析能力,支持解析 Microsoft Excel 97-2003 旧格式文档。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Requirement: XLS 文档解析
|
||||||
|
系统 SHALL 支持解析 .xls 格式文档,按优先级尝试多个解析器。
|
||||||
|
|
||||||
|
#### Scenario: 按优先级尝试解析器
|
||||||
|
- **WHEN** 解析 XLS 文档
|
||||||
|
- **THEN** 系统按 unstructured → markitdown → pandas+xlrd 的顺序尝试
|
||||||
|
|
||||||
|
#### Scenario: 成功解析
|
||||||
|
- **WHEN** 任一解析器成功
|
||||||
|
- **THEN** 系统返回解析结果
|
||||||
|
|
||||||
|
#### Scenario: 所有解析器失败
|
||||||
|
- **WHEN** 所有解析器均失败
|
||||||
|
- **THEN** 系统返回失败列表并退出非零状态码
|
||||||
|
|
||||||
|
### Requirement: unstructured 解析器
|
||||||
|
系统 SHALL 支持使用 unstructured 库解析 XLS。
|
||||||
|
|
||||||
|
#### Scenario: unstructured 解析成功
|
||||||
|
- **WHEN** unstructured 库可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 内容
|
||||||
|
|
||||||
|
#### Scenario: unstructured 库未安装
|
||||||
|
- **WHEN** unstructured 库未安装
|
||||||
|
- **THEN** 系统尝试下一个解析器
|
||||||
|
|
||||||
|
### Requirement: markitdown 解析器
|
||||||
|
系统 SHALL 支持使用 markitdown 库解析 XLS。
|
||||||
|
|
||||||
|
#### Scenario: markitdown 解析成功
|
||||||
|
- **WHEN** markitdown 库可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 内容
|
||||||
|
|
||||||
|
#### Scenario: markitdown 库未安装
|
||||||
|
- **WHEN** markitdown 库未安装
|
||||||
|
- **THEN** 系统尝试下一个解析器
|
||||||
|
|
||||||
|
### Requirement: pandas+xlrd 解析器
|
||||||
|
系统 SHALL 支持使用 pandas + xlrd 库解析 XLS。
|
||||||
|
|
||||||
|
#### Scenario: pandas+xlrd 解析成功
|
||||||
|
- **WHEN** pandas 和 xlrd 库可用且文档有效
|
||||||
|
- **THEN** 系统返回 Markdown 格式的表格内容,包含所有工作表
|
||||||
|
|
||||||
|
#### Scenario: pandas 或 xlrd 库未安装
|
||||||
|
- **WHEN** pandas 或 xlrd 库未安装
|
||||||
|
- **THEN** 系统尝试下一个解析器(pandas+xlrd 是优先级最低的解析器,其后没有其他解析器时按“所有解析器失败”处理)
|
||||||
|
|
||||||
|
### Requirement: 每个解析器独立文件
|
||||||
|
系统 SHALL 将每个解析器实现为独立的单文件模块。
|
||||||
|
|
||||||
|
#### Scenario: unstructured 解析器在独立文件
|
||||||
|
- **WHEN** 使用 unstructured 解析器
|
||||||
|
- **THEN** 从 readers/xls/unstructured.py 导入
|
||||||
|
|
||||||
|
#### Scenario: markitdown 解析器在独立文件
|
||||||
|
- **WHEN** 使用 markitdown 解析器
|
||||||
|
- **THEN** 从 readers/xls/markitdown.py 导入
|
||||||
|
|
||||||
|
#### Scenario: pandas 解析器在独立文件
|
||||||
|
- **WHEN** 使用 pandas 解析器
|
||||||
|
- **THEN** 从 readers/xls/pandas.py 导入
|
||||||
|
|
||||||
|
### Requirement: XLS Reader 测试使用静态文件
|
||||||
|
XLS Reader 测试 MUST 使用 `tests/test_readers/fixtures/xls/` 下的静态文件。
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 simple.xls
|
||||||
|
- **WHEN** 测试 XLS Reader 基础解析能力
|
||||||
|
- **THEN** 使用 `simple.xls` 静态文件
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 multiple_sheets.xls
|
||||||
|
- **WHEN** 测试 XLS Reader 多工作表解析
|
||||||
|
- **THEN** 使用 `multiple_sheets.xls` 静态文件
|
||||||
|
|
||||||
|
#### Scenario: 测试使用 with_formulas.xls
|
||||||
|
- **WHEN** 测试 XLS Reader 公式结果读取
|
||||||
|
- **THEN** 使用 `with_formulas.xls` 静态文件
|
||||||
244
publish.py
Normal file
244
publish.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Skill 发布脚本
|
||||||
|
|
||||||
|
使用方式:
|
||||||
|
uv run python publish.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
|
||||||
|
# Git URL of the repository that is cloned and pushed to when publishing.
TARGET_REPO_URL = "https://github.com/lanyuanxiaoyao/skills.git"
# Path inside the cloned repository that receives the build/ contents.
TARGET_PATH = "skills/lyxy-document-reader"
|
||||||
|
|
||||||
|
|
||||||
|
def check_build_dir(build_dir: str) -> None:
    """
    Verify that the build output directory exists before publishing.

    Args:
        build_dir: Path of the build directory.

    Raises:
        SystemExit: Exits with status 1 when ``build_dir`` is missing or is
            not a directory.
    """
    # isdir rather than exists: a stray regular file named "build" must be
    # rejected here instead of failing later when the directory is listed.
    if not os.path.isdir(build_dir):
        print("错误: build/ 目录不存在")
        print("请先运行 build.py:")
        # The build is always obfuscated, so the hint has to load PyArmor
        # the same way publish.sh does.
        print(" uv run --with pyarmor python build.py")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def check_build_skill_md(build_skill_md_path: str) -> None:
    """
    Verify that build/SKILL.md exists before publishing.

    Args:
        build_skill_md_path: Path of the built SKILL.md file.

    Raises:
        SystemExit: Exits with status 1 when the path is missing or is not a
            regular file.
    """
    # isfile rather than exists: the file is opened and parsed afterwards,
    # so a directory at this path must be reported as "missing" too.
    if not os.path.isfile(build_skill_md_path):
        print("错误: build/SKILL.md 不存在")
        print("请先运行 build.py:")
        # The build is always obfuscated, so the hint has to load PyArmor
        # the same way publish.sh does.
        print(" uv run --with pyarmor python build.py")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_version_from_skill_md(skill_md_path: str) -> str:
    """
    Extract ``metadata.version`` from the YAML frontmatter of SKILL.md.

    This is a minimal line-based scan, not a YAML parser: it looks for a
    ``metadata:`` line between the first two ``---`` delimiters and returns
    the value of the first ``version:`` line found while inside that block.

    Args:
        skill_md_path: Path of the SKILL.md file to read (UTF-8).

    Returns:
        The version string with surrounding quotes removed.

    Raises:
        SystemExit: Exits with status 1 when no version can be found.
    """
    with open(skill_md_path, "r", encoding="utf-8") as f:
        content = f.read()

    in_frontmatter = False
    in_metadata = False
    for line in content.splitlines():
        stripped = line.strip()
        if stripped == "---":
            if in_frontmatter:
                # Closing delimiter: frontmatter is over.
                break
            in_frontmatter = True
            continue
        if not in_frontmatter:
            continue

        if stripped == "metadata:":
            in_metadata = True
        elif in_metadata and stripped.startswith("version:"):
            # Take everything after the first colon and drop the quotes.
            version_part = stripped.split(":", 1)[1].strip()
            return version_part.strip('"').strip("'")
        elif in_metadata and stripped and not line.startswith((" ", "\t")):
            # A non-indented key ends the metadata block. The raw line is
            # inspected because the stripped text never starts with
            # whitespace; testing it would end the block on every sibling
            # key such as "author:" and hide a later "version:".
            in_metadata = False

    print("错误: 无法从 build/SKILL.md 解析版本号")
    print("请检查 build/SKILL.md 是否包含 metadata.version 字段")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def run_git_command(repo_dir: str, args: list[str]) -> subprocess.CompletedProcess:
    """
    Execute ``git <args>`` with ``repo_dir`` as the working directory.

    Output is captured as text rather than streamed to the console.

    Args:
        repo_dir: Directory in which git is invoked.
        args: Arguments placed after the ``git`` executable name.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command.

    Raises:
        subprocess.CalledProcessError: When git exits with a non-zero status.
    """
    git_invocation = ["git"] + args
    completed = subprocess.run(
        git_invocation,
        cwd=repo_dir,
        check=True,  # non-zero exit status becomes CalledProcessError
        capture_output=True,
        text=True,
    )
    return completed
|
||||||
|
|
||||||
|
|
||||||
|
def clone_repo(temp_dir: str) -> str:
    """
    Shallow-clone the target repository into ``temp_dir``.

    The clone is created in a ``skills-repo`` sub-directory using
    ``--depth 1``.

    Args:
        temp_dir: Directory in which the clone is created.

    Returns:
        Path of the cloned repository.

    Raises:
        SystemExit: Exits with status 1 when the clone command fails.
    """
    repo_dir = os.path.join(temp_dir, "skills-repo")
    clone_args = ["clone", "--depth", "1", TARGET_REPO_URL, "skills-repo"]

    try:
        run_git_command(temp_dir, clone_args)
    except subprocess.CalledProcessError as e:
        # Show everything git reported, then abort the publish.
        failure_report = (
            "错误: Clone 仓库失败",
            f" 返回码: {e.returncode}",
            f" 标准输出: {e.stdout}",
            f" 错误输出: {e.stderr}",
        )
        for report_line in failure_report:
            print(report_line)
        sys.exit(1)

    return repo_dir
|
||||||
|
|
||||||
|
|
||||||
|
def clear_target_dir(repo_dir: str) -> str:
    """
    Reset the publish destination inside the cloned repository.

    Any existing directory at the destination is removed together with its
    contents, and an empty directory is created in its place.

    Args:
        repo_dir: Root of the cloned repository.

    Returns:
        Path of the freshly created, empty destination directory.
    """
    destination = os.path.join(repo_dir, TARGET_PATH)

    # Remove the previous publish so stale files do not survive.
    if os.path.exists(destination):
        shutil.rmtree(destination)

    # Recreate it, including missing parent directories.
    os.makedirs(destination, exist_ok=True)
    return destination
|
||||||
|
|
||||||
|
|
||||||
|
def copy_build_contents(build_dir: str, target_dir: str) -> None:
    """
    Copy every entry of ``build_dir`` into ``target_dir``.

    Directories are copied recursively; all other entries are copied as
    files together with their metadata.

    Args:
        build_dir: Source directory whose direct children are copied.
        target_dir: Existing directory that receives the copies.
    """
    for entry_name in os.listdir(build_dir):
        source = os.path.join(build_dir, entry_name)
        destination = os.path.join(target_dir, entry_name)

        # Choose the copier by entry kind: a tree copy for directories,
        # a metadata-preserving file copy for everything else.
        copier = shutil.copytree if os.path.isdir(source) else shutil.copy2
        copier(source, destination)
|
||||||
|
|
||||||
|
|
||||||
|
def git_commit_and_push(repo_dir: str, version: str) -> None:
    """
    Stage all changes, commit them and push from ``repo_dir``.

    The commit message has the form
    ``publish: lyxy-document-reader <version>``.

    Args:
        repo_dir: Root of the cloned repository.
        version: Version string embedded in the commit message.

    Raises:
        SystemExit: Exits with status 1 when any git step fails.
    """
    commit_message = f"publish: lyxy-document-reader {version}"
    git_steps = (
        ["add", "."],
        ["commit", "-m", commit_message],
        ["push"],
    )

    try:
        # Steps run in order; the first failure stops the sequence.
        for step_args in git_steps:
            run_git_command(repo_dir, step_args)
    except subprocess.CalledProcessError as e:
        failure_report = (
            "错误: Git 操作失败",
            f" 返回码: {e.returncode}",
            f" 标准输出: {e.stdout}",
            f" 错误输出: {e.stderr}",
        )
        for report_line in failure_report:
            print(report_line)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """
    Run the complete publish workflow.

    Steps: check that build/ and build/SKILL.md exist, read the version from
    SKILL.md, clone the target repository into a temporary directory, replace
    the target path with the build/ contents, then commit and push. Each
    helper exits the process with status 1 on failure.
    """
    # Paths are resolved relative to this script, not the current directory.
    project_root = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(project_root, "build")
    build_skill_md_path = os.path.join(build_dir, "SKILL.md")

    # Pre-flight checks on the build output.
    check_build_dir(build_dir)
    check_build_skill_md(build_skill_md_path)

    # The version is needed for the commit message.
    version = parse_version_from_skill_md(build_skill_md_path)

    # The clone lives in a temporary directory that is removed when the
    # with-block exits, whether publishing succeeded or failed.
    with tempfile.TemporaryDirectory(prefix="lyxy-publish-") as temp_dir:
        repo_dir = clone_repo(temp_dir)
        target_dir = clear_target_dir(repo_dir)
        copy_build_contents(build_dir, target_dir)
        git_commit_and_push(repo_dir, version)

    # Only reached when every step succeeded; failures exit earlier.
    print(f"发布成功: lyxy-document-reader {version}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
18
publish.sh
Executable file
18
publish.sh
Executable file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
#
# Obfuscated build + publish script.
#
# Usage:
#   ./publish.sh
#

# Stop at the first failing command so a failed build is never published.
set -e

# Run from the directory that contains this script.
cd "$(dirname "$0")"

echo ">>> 构建 + 发布"
echo "[1/2] 构建..."
# Step 1: build with PyArmor loaded on demand.
uv run --with pyarmor python build.py
echo "[2/2] 发布..."
# Step 2: publish the build output.
uv run python publish.py
echo ">>> 完成"
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
[project]
|
|
||||||
name = "lyxy-document"
|
|
||||||
version = "0.1.0"
|
|
||||||
description = "帮助AI工具读取转换文档到markdown的skill"
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.11"
|
|
||||||
dependencies = [
|
|
||||||
"chardet>=5.0.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
|
||||||
docx = [
|
|
||||||
"docling>=2.0.0",
|
|
||||||
"unstructured>=0.12.0",
|
|
||||||
"markitdown>=0.1.0",
|
|
||||||
"pypandoc-binary>=1.13.0",
|
|
||||||
"python-docx>=1.1.0",
|
|
||||||
"markdownify>=0.12.0",
|
|
||||||
]
|
|
||||||
xlsx = [
|
|
||||||
"docling>=2.0.0",
|
|
||||||
"unstructured>=0.12.0",
|
|
||||||
"markitdown>=0.1.0",
|
|
||||||
"pandas>=2.0.0",
|
|
||||||
"tabulate>=0.9.0",
|
|
||||||
]
|
|
||||||
pptx = [
|
|
||||||
"docling>=2.0.0",
|
|
||||||
"unstructured>=0.12.0",
|
|
||||||
"markitdown>=0.1.0",
|
|
||||||
"python-pptx>=0.6.0",
|
|
||||||
"markdownify>=0.12.0",
|
|
||||||
]
|
|
||||||
pdf = [
|
|
||||||
"docling>=2.0.0",
|
|
||||||
"unstructured>=0.12.0",
|
|
||||||
"unstructured-paddleocr>=0.1.0",
|
|
||||||
"markitdown>=0.1.0",
|
|
||||||
"pypdf>=4.0.0",
|
|
||||||
"markdownify>=0.12.0",
|
|
||||||
]
|
|
||||||
html = [
|
|
||||||
"trafilatura>=1.10.0",
|
|
||||||
"domscribe>=0.1.0",
|
|
||||||
"markitdown>=0.1.0",
|
|
||||||
"html2text>=2024.2.26",
|
|
||||||
"beautifulsoup4>=4.12.0",
|
|
||||||
]
|
|
||||||
http = [
|
|
||||||
"httpx>=0.27.0",
|
|
||||||
"pyppeteer>=2.0.0",
|
|
||||||
"selenium>=4.18.0",
|
|
||||||
]
|
|
||||||
office = [
|
|
||||||
"lyxy-document[docx,xlsx,pptx,pdf]",
|
|
||||||
]
|
|
||||||
web = [
|
|
||||||
"lyxy-document[html,http]",
|
|
||||||
]
|
|
||||||
full = [
|
|
||||||
"lyxy-document[office,web]",
|
|
||||||
]
|
|
||||||
dev = [
|
|
||||||
"pytest>=8.0.0",
|
|
||||||
"pytest-cov>=4.1.0",
|
|
||||||
"reportlab>=4.0.0",
|
|
||||||
]
|
|
||||||
284
run_tests.py
Normal file
284
run_tests.py
Normal file
@@ -0,0 +1,284 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""测试运行器 - 自动根据测试类型加载依赖并运行 pytest"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Locate the project root and the scripts directory relative to this file.
script_file = Path(__file__).resolve()
project_root = script_file.parent
scripts_dir = project_root.joinpath("scripts")
bootstrap_path = str(scripts_dir.joinpath("bootstrap.py"))

# Make modules under scripts/ importable (appended once, at the end).
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

# Silence progress bars and telemetry of third-party libraries.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
|
||||||
|
|
||||||
|
# Packages the test fixtures need to create temporary test files, keyed by
# platform identifier. The two platform lists differ only in the pandas
# requirement.
TEST_FIXTURE_DEPENDENCIES = {
    platform: [
        "python-docx==1.2.0",  # creates temporary DOCX files
        "reportlab==4.2.2",  # creates temporary PDF files
        pandas_requirement,  # creates temporary XLSX files
        "openpyxl==3.1.5",  # needed by pandas to write XLSX
        "python-pptx==1.0.2",  # creates temporary PPTX files
    ]
    for platform, pandas_requirement in (
        ("default", "pandas==3.0.1"),
        ("Darwin-x86_64", "pandas<3.0.0"),  # Darwin-x86_64 compatibility
    )
}
|
||||||
|
|
||||||
|
# 测试类型映射
|
||||||
|
_TEST_TYPES = {
|
||||||
|
# 文件类型测试(有依赖配置)
|
||||||
|
"pdf": {"key": "pdf", "path": "tests/test_readers/test_pdf/"},
|
||||||
|
"docx": {"key": "docx", "path": "tests/test_readers/test_docx/"},
|
||||||
|
"xlsx": {"key": "xlsx", "path": "tests/test_readers/test_xlsx/"},
|
||||||
|
"pptx": {"key": "pptx", "path": "tests/test_readers/test_pptx/"},
|
||||||
|
"html": {"key": "html", "path": "tests/test_readers/test_html/"},
|
||||||
|
"xls": {"key": "xls", "path": "tests/test_readers/test_xls/"},
|
||||||
|
"doc": {"key": "doc", "path": "tests/test_readers/test_doc/"},
|
||||||
|
"ppt": {"key": "ppt", "path": "tests/test_readers/test_ppt/"},
|
||||||
|
# 核心测试(cli 测试需要所有依赖,因为它测试多种格式)
|
||||||
|
"cli": {"key": "all", "path": "tests/test_cli/"},
|
||||||
|
"core": {"key": None, "path": "tests/test_core/"},
|
||||||
|
"utils": {"key": None, "path": "tests/test_utils/"},
|
||||||
|
# 所有测试(合并所有依赖)
|
||||||
|
"all": {"key": "all", "path": "tests/"},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_all_dependencies(platform_id: str):
|
||||||
|
"""
|
||||||
|
收集所有文件类型的依赖并去重(内部辅助函数)。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
platform_id: 平台标识
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(python_version, dependencies) 元组
|
||||||
|
"""
|
||||||
|
from config import DEPENDENCIES
|
||||||
|
|
||||||
|
python_version = None
|
||||||
|
all_deps = set()
|
||||||
|
for type_key, type_config in DEPENDENCIES.items():
|
||||||
|
# 先尝试特定平台配置
|
||||||
|
if platform_id in type_config:
|
||||||
|
cfg = type_config[platform_id]
|
||||||
|
elif "default" in type_config:
|
||||||
|
cfg = type_config["default"]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
# 记录 python 版本(优先使用有特殊要求的)
|
||||||
|
if cfg.get("python") and not python_version:
|
||||||
|
python_version = cfg["python"]
|
||||||
|
# 收集依赖
|
||||||
|
for dep in cfg.get("dependencies", []):
|
||||||
|
all_deps.add(dep)
|
||||||
|
return python_version, list(all_deps)
|
||||||
|
|
||||||
|
|
||||||
|
def get_dependencies_for_type(test_type: str, platform_id: str):
    """
    Look up the dependency configuration for one test type.

    All dependency data comes from ``config.DEPENDENCIES``; ``_TEST_TYPES``
    only maps the test type to a dependency key.

    Args:
        test_type: Test type name (pdf/docx/.../all).
        platform_id: Platform identifier used to pick a configuration.

    Returns:
        ``(python_version, dependencies)``. Returns ``(None, [])`` for an
        unknown test type, a type without a dependency key, or a key with no
        configuration for this platform and no default.
    """
    from config import DEPENDENCIES

    type_entry = _TEST_TYPES.get(test_type)
    if not type_entry:
        return None, []

    dep_key = type_entry["key"]
    if dep_key is None:
        # core/utils tests need no format-specific dependencies.
        return None, []
    if dep_key == "all":
        # "cli" and "all" both use the union of every file type.
        return _collect_all_dependencies(platform_id)
    if dep_key not in DEPENDENCIES:
        return None, []

    # Single file type: platform-specific configuration first, then default.
    type_config = DEPENDENCIES[dep_key]
    for variant in (platform_id, "default"):
        if variant in type_config:
            selected = type_config[variant]
            return selected.get("python"), selected.get("dependencies", [])
    return None, []
|
||||||
|
|
||||||
|
|
||||||
|
def get_fixture_dependencies(platform_id: str):
    """
    Return the fixture dependencies configured for a platform.

    Args:
        platform_id: Platform identifier.

    Returns:
        list: The entry of ``TEST_FIXTURE_DEPENDENCIES`` for ``platform_id``;
        otherwise its ``"default"`` entry; otherwise an empty list.
    """
    # Try the exact platform first, then fall back to the shared default.
    for candidate in (platform_id, "default"):
        if candidate in TEST_FIXTURE_DEPENDENCIES:
            return TEST_FIXTURE_DEPENDENCIES[candidate]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def generate_uv_args(
    dependencies: list,
    test_path: str,
    pytest_args: list,
    python_version: str = None,
    platform_id: str = None,
):
    """
    Build the ``uv run`` argument list for ``subprocess.run``.

    Args:
        dependencies: Packages required by the selected test type. ``None``
            is treated as an empty list.
        test_path: Path handed to pytest.
        pytest_args: Extra arguments forwarded to pytest unchanged. ``None``
            is treated as an empty list.
        python_version: Python version to request via ``--python``; a falsy
            value leaves the interpreter choice to uv.
        platform_id: Platform identifier used to select fixture
            dependencies; a falsy value adds no fixture dependencies.

    Returns:
        The argument list, shaped as
        ``uv run [--python V] --with pytest [--with DEP ...] pytest PATH [ARGS ...]``.
    """
    args = ["uv", "run"]

    if python_version:
        args.extend(["--python", python_version])

    # pytest itself is always needed.
    args.extend(["--with", "pytest"])

    # Fixture dependencies only apply when the platform is known.
    fixture_deps = get_fixture_dependencies(platform_id) if platform_id else []

    # Merge both sources and drop duplicates; tolerate None for either one.
    all_deps = set(dependencies or ()) | set(fixture_deps or ())
    # "--with pytest" is already present, so do not emit it a second time.
    all_deps.discard("pytest")

    # Sorted so the generated command is deterministic.
    for dep in sorted(all_deps):
        args.extend(["--with", dep])

    # The command to run, its target path, then the pass-through arguments.
    args.append("pytest")
    args.append(test_path)
    args.extend(pytest_args or ())

    return args
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Entry point: read the command line and run pytest through uv.

    The first command-line argument is the test type; everything after it is
    forwarded to pytest unchanged. The process exits with pytest's return
    code, or with 1 when no arguments are given, the test type is unknown,
    or uv cannot be found.
    """
    # The parser is only used to render the help text. Arguments are split
    # by hand below because argparse.REMAINDER would swallow "--help".
    parser = argparse.ArgumentParser(
        description="自动根据测试类型加载依赖并运行 pytest",
        usage="%(prog)s <test_type> [pytest_args...]",
    )
    parser.add_argument(
        "test_type",
        choices=list(_TEST_TYPES.keys()),
        help="测试类型: " + ", ".join(_TEST_TYPES.keys()),
    )
    parser.add_argument(
        "pytest_args",
        nargs=argparse.REMAINDER,
        help="透传给 pytest 的参数（如 -v, --cov 等）",
    )

    cli_args = sys.argv[1:]

    # No arguments at all: show usage and fail.
    if not cli_args:
        parser.print_help()
        sys.exit(1)

    # Help requested as the first argument.
    if cli_args[0] in ("-h", "--help"):
        parser.print_help()
        sys.exit(0)

    test_type, pytest_args = cli_args[0], cli_args[1:]

    # Validate the test type; diagnostics go to stderr, not stdout.
    if test_type not in _TEST_TYPES:
        print(f"错误: 未知的测试类型 '{test_type}'", file=sys.stderr)
        print(f"可用类型: {', '.join(_TEST_TYPES.keys())}", file=sys.stderr)
        sys.exit(1)

    # uv is mandatory for this runner.
    uv_path = shutil.which("uv")
    if not uv_path:
        print("错误: 未找到 uv，请先安装 uv", file=sys.stderr)
        sys.exit(1)

    test_path = _TEST_TYPES[test_type]["path"]

    # Imported here rather than at module level, as in the original code.
    from core.advice_generator import get_platform

    platform_id = get_platform()
    python_version, dependencies = get_dependencies_for_type(test_type, platform_id)

    uv_args = generate_uv_args(
        dependencies=dependencies,
        test_path=test_path,
        pytest_args=pytest_args,
        python_version=python_version,
        platform_id=platform_id,
    )
    # Run the executable that was actually resolved on PATH.
    uv_args[0] = uv_path

    # The child process gets the project root as its PYTHONPATH.
    env = os.environ.copy()
    env["PYTHONPATH"] = str(project_root)

    result = subprocess.run(uv_args, env=env, cwd=str(project_root))
    sys.exit(result.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
111
scripts/bootstrap.py
Normal file
111
scripts/bootstrap.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""文档解析器实际执行模块,承载业务逻辑。"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Put the scripts/ directory on sys.path so the script can be launched from
# any working directory.
scripts_dir = Path(__file__).resolve().parent
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

# Silence third-party progress bars and telemetry so that only the parse
# result is printed.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
warnings.filterwarnings("ignore")

# Root logger: ERROR and above only.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.ERROR)

# Apply the same threshold to the third-party loggers.
logging.getLogger('unstructured').setLevel(logging.ERROR)
logging.getLogger('docling').setLevel(logging.ERROR)
|
||||||
|
|
||||||
|
from core import (
|
||||||
|
FileDetectionError,
|
||||||
|
ReaderNotFoundError,
|
||||||
|
output_result,
|
||||||
|
parse_input,
|
||||||
|
process_content,
|
||||||
|
)
|
||||||
|
from readers import READERS
|
||||||
|
|
||||||
|
|
||||||
|
def run_normal(args) -> None:
    """
    Normal execution mode: parse the input and output the result.

    Args:
        args: Parsed command-line namespace. ``args.input_path`` is read
            here and the whole namespace is handed on to ``output_result``.

    Exits with status 1 when ``parse_input`` raises ``FileDetectionError`` or
    ``ReaderNotFoundError``, or when it returns no content.
    """
    # One fresh instance of every registered reader class.
    reader_instances = [reader_cls() for reader_cls in READERS]

    try:
        content, failures = parse_input(args.input_path, reader_instances)
    except (FileDetectionError, ReaderNotFoundError) as exc:
        print(f"错误: {exc}")
        sys.exit(1)

    # No content came back: report each recorded failure, then give up.
    if content is None:
        print("所有解析方法均失败:")
        for failure in failures:
            print(failure)
        sys.exit(1)

    # Post-process the content, then emit it according to the options.
    output_result(process_content(content), args)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Entry point: build the command-line parser, parse arguments and run."""
    input_help = "DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL"

    parser = argparse.ArgumentParser(description=f"将 {input_help} 解析为 Markdown")
    parser.add_argument("input_path", help=input_help)
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用，指定每个检索结果包含的前后行数（不包含空行）",
    )

    # The output modes exclude one another.
    exclusive = parser.add_mutually_exclusive_group()

    # Boolean switches.
    for short_flag, long_flag, help_text in (
        ("-c", "--count", "返回解析后的 markdown 文档的总字数"),
        ("-l", "--lines", "返回解析后的 markdown 文档的总行数"),
        ("-t", "--titles", "返回解析后的 markdown 文档的标题行（1-6级）"),
    ):
        exclusive.add_argument(short_flag, long_flag, action="store_true", help=help_text)

    # Options that take a value.
    for short_flag, long_flag, help_text in (
        ("-tc", "--title-content", "指定标题名称，输出该标题及其下级内容（不包含#号）"),
        ("-s", "--search", "使用正则表达式搜索文档，返回所有匹配结果（用---分隔）"),
    ):
        exclusive.add_argument(short_flag, long_flag, help=help_text)

    run_normal(parser.parse_args())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -17,3 +17,190 @@ class Config:
|
|||||||
# 日志
|
# 日志
|
||||||
# 日志等级,默认只输出 ERROR 级别避免干扰 Markdown 输出
|
# 日志等级,默认只输出 ERROR 级别避免干扰 Markdown 输出
|
||||||
LOG_LEVEL = "ERROR"
|
LOG_LEVEL = "ERROR"
|
||||||
|
|
||||||
|
|
||||||
|
# 依赖配置:按文件类型和平台组织
|
||||||
|
# 每个平台配置包含 python 版本要求(None 表示使用默认)和依赖列表
|
||||||
|
DEPENDENCIES = {
|
||||||
|
"pdf": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.80.0",
|
||||||
|
"unstructured[pdf]==0.21.5",
|
||||||
|
"markitdown[pdf]==0.1.5",
|
||||||
|
"pypdf==6.9.0",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.40.0",
|
||||||
|
"docling-parse==4.0.0",
|
||||||
|
"numpy<2",
|
||||||
|
"markitdown[pdf]==0.1.5",
|
||||||
|
"pypdf==6.9.0",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"docx": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.80.0",
|
||||||
|
"unstructured[docx]==0.21.5",
|
||||||
|
"markitdown[docx]==0.1.5",
|
||||||
|
"pypandoc-binary==1.17",
|
||||||
|
"python-docx==1.2.0",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.40.0",
|
||||||
|
"docling-parse==4.0.0",
|
||||||
|
"numpy<2",
|
||||||
|
"markitdown[docx]==0.1.5",
|
||||||
|
"pypandoc-binary==1.17",
|
||||||
|
"python-docx==1.2.0",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"xlsx": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.80.0",
|
||||||
|
"unstructured[xlsx]==0.21.5",
|
||||||
|
"markitdown[xlsx]==0.1.5",
|
||||||
|
"pandas==3.0.1",
|
||||||
|
"tabulate==0.10.0",
|
||||||
|
"openpyxl==3.1.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.40.0",
|
||||||
|
"docling-parse==4.0.0",
|
||||||
|
"numpy<2",
|
||||||
|
"markitdown[xlsx]==0.1.5",
|
||||||
|
"pandas<3.0.0",
|
||||||
|
"tabulate==0.10.0",
|
||||||
|
"openpyxl==3.1.5"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pptx": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.80.0",
|
||||||
|
"unstructured[pptx]==0.21.5",
|
||||||
|
"markitdown[pptx]==0.1.5",
|
||||||
|
"python-pptx==1.0.2",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.40.0",
|
||||||
|
"docling-parse==4.0.0",
|
||||||
|
"numpy<2",
|
||||||
|
"markitdown[pptx]==0.1.5",
|
||||||
|
"python-pptx==1.0.2",
|
||||||
|
"markdownify==1.2.2"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"html": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"trafilatura==2.0.0",
|
||||||
|
"domscribe==0.1.3",
|
||||||
|
"markitdown==0.1.5",
|
||||||
|
"html2text==2025.4.15",
|
||||||
|
"beautifulsoup4==4.14.3",
|
||||||
|
"httpx==0.28.1",
|
||||||
|
"chardet==7.1.0",
|
||||||
|
"pyppeteer==2.0.0",
|
||||||
|
"selenium==4.25.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"trafilatura==2.0.0",
|
||||||
|
"domscribe==0.1.3",
|
||||||
|
"markitdown==0.1.5",
|
||||||
|
"html2text==2025.4.15",
|
||||||
|
"beautifulsoup4==4.14.3",
|
||||||
|
"httpx==0.28.1",
|
||||||
|
"chardet==7.1.0",
|
||||||
|
"pyppeteer==2.0.0",
|
||||||
|
"selenium==4.25.0"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"xls": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"unstructured[xlsx]==0.21.5",
|
||||||
|
"markitdown[xls]==0.1.5",
|
||||||
|
"pandas==3.0.1",
|
||||||
|
"tabulate==0.10.0",
|
||||||
|
"xlrd==2.0.2",
|
||||||
|
"olefile==0.47"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"markitdown[xls]==0.1.5",
|
||||||
|
"pandas<3.0.0",
|
||||||
|
"tabulate==0.10.0",
|
||||||
|
"xlrd==2.0.2",
|
||||||
|
"olefile==0.47",
|
||||||
|
"openpyxl==3.1.5"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"doc": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ppt": {
|
||||||
|
"default": {
|
||||||
|
"python": None,
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.80.0",
|
||||||
|
"unstructured[pptx]==0.21.5",
|
||||||
|
"markitdown[pptx]==0.1.5",
|
||||||
|
"python-pptx==1.0.2",
|
||||||
|
"markdownify==1.2.2",
|
||||||
|
"olefile==0.47"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Darwin-x86_64": {
|
||||||
|
"python": "3.12",
|
||||||
|
"dependencies": [
|
||||||
|
"docling==2.40.0",
|
||||||
|
"docling-parse==4.0.0",
|
||||||
|
"numpy<2",
|
||||||
|
"markitdown[pptx]==0.1.5",
|
||||||
|
"python-pptx==1.0.2",
|
||||||
|
"markdownify==1.2.2",
|
||||||
|
"olefile==0.47"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -8,20 +8,15 @@ from .exceptions import (
|
|||||||
DownloadError,
|
DownloadError,
|
||||||
)
|
)
|
||||||
from .markdown import (
|
from .markdown import (
|
||||||
parse_with_markitdown,
|
|
||||||
parse_with_docling,
|
|
||||||
build_markdown_table,
|
|
||||||
flush_list_stack,
|
|
||||||
safe_open_zip,
|
|
||||||
normalize_markdown_whitespace,
|
normalize_markdown_whitespace,
|
||||||
remove_markdown_images,
|
remove_markdown_images,
|
||||||
get_heading_level,
|
get_heading_level,
|
||||||
extract_titles,
|
extract_titles,
|
||||||
extract_title_content,
|
extract_title_content,
|
||||||
search_markdown,
|
search_markdown,
|
||||||
_unstructured_elements_to_markdown,
|
|
||||||
)
|
)
|
||||||
from .parser import parse_input, process_content, output_result
|
from .parser import parse_input, process_content, output_result
|
||||||
|
from .advice_generator import generate_advice
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"LyxyDocumentError",
|
"LyxyDocumentError",
|
||||||
@@ -29,19 +24,14 @@ __all__ = [
|
|||||||
"ReaderNotFoundError",
|
"ReaderNotFoundError",
|
||||||
"ParseError",
|
"ParseError",
|
||||||
"DownloadError",
|
"DownloadError",
|
||||||
"parse_with_markitdown",
|
|
||||||
"parse_with_docling",
|
|
||||||
"build_markdown_table",
|
|
||||||
"flush_list_stack",
|
|
||||||
"safe_open_zip",
|
|
||||||
"normalize_markdown_whitespace",
|
"normalize_markdown_whitespace",
|
||||||
"remove_markdown_images",
|
"remove_markdown_images",
|
||||||
"get_heading_level",
|
"get_heading_level",
|
||||||
"extract_titles",
|
"extract_titles",
|
||||||
"extract_title_content",
|
"extract_title_content",
|
||||||
"search_markdown",
|
"search_markdown",
|
||||||
"_unstructured_elements_to_markdown",
|
|
||||||
"parse_input",
|
"parse_input",
|
||||||
"process_content",
|
"process_content",
|
||||||
"output_result",
|
"output_result",
|
||||||
|
"generate_advice",
|
||||||
]
|
]
|
||||||
|
|||||||
300
scripts/core/advice_generator.py
Normal file
300
scripts/core/advice_generator.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
"""建议生成器模块,根据文件类型和平台返回执行建议。"""
|
||||||
|
|
||||||
|
import platform
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional, Tuple, List, Type
|
||||||
|
|
||||||
|
from config import DEPENDENCIES
|
||||||
|
from readers import BaseReader
|
||||||
|
from readers import (
|
||||||
|
PdfReader,
|
||||||
|
DocxReader,
|
||||||
|
XlsxReader,
|
||||||
|
PptxReader,
|
||||||
|
HtmlReader,
|
||||||
|
XlsReader,
|
||||||
|
DocReader,
|
||||||
|
PptReader,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Reader 类到配置 key 的映射
|
||||||
|
_READER_KEY_MAP: Dict[Type[BaseReader], str] = {
|
||||||
|
PdfReader: "pdf",
|
||||||
|
DocxReader: "docx",
|
||||||
|
XlsxReader: "xlsx",
|
||||||
|
PptxReader: "pptx",
|
||||||
|
HtmlReader: "html",
|
||||||
|
XlsReader: "xls",
|
||||||
|
DocReader: "doc",
|
||||||
|
PptReader: "ppt",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_file_type_light(input_path: str, readers: List[BaseReader]) -> Optional[Type[BaseReader]]:
    """
    Lightweight file type detection that reuses each reader's ``supports``.

    Args:
        input_path: File path or URL.
        readers: Already instantiated readers, checked in order.

    Returns:
        The class of the first reader whose ``supports(input_path)`` is
        truthy, or ``None`` when no reader accepts the input.
    """
    matching_classes = (
        candidate.__class__ for candidate in readers if candidate.supports(input_path)
    )
    # The generator is lazy, so later readers are not queried after a match.
    return next(matching_classes, None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_platform() -> str:
    """
    Return the current platform identifier in the form ``{system}-{machine}``.

    Returns:
        The values of ``platform.system()`` and ``platform.machine()`` joined
        by a hyphen, for example "Darwin-arm64", "Linux-x86_64" or
        "Windows-AMD64".
    """
    return "-".join((platform.system(), platform.machine()))
|
||||||
|
|
||||||
|
|
||||||
|
def get_dependencies(reader_cls: Type[BaseReader], platform_id: str) -> Tuple[Optional[str], list]:
    """
    Return the dependency configuration for a reader class on a platform.

    Args:
        reader_cls: Reader class.
        platform_id: Platform identifier (for example "Darwin-arm64").

    Returns:
        A ``(python_version, dependencies)`` tuple:
        - python_version: required Python version, ``None`` for the default
        - dependencies: list of packages
        ``(None, [])`` is returned when the reader class has no usable entry
        in ``DEPENDENCIES``.
    """
    config_key = _READER_KEY_MAP.get(reader_cls)
    if not (config_key and config_key in DEPENDENCIES):
        return None, []

    per_platform = DEPENDENCIES[config_key]

    # The platform-specific entry wins; "default" is the fallback.
    for candidate in (platform_id, "default"):
        if candidate in per_platform:
            selected = per_platform[candidate]
            return selected.get("python"), selected.get("dependencies", [])

    return None, []
|
||||||
|
|
||||||
|
|
||||||
|
# Characters that make a token unsafe to paste unquoted into a POSIX shell:
# whitespace, glob characters, redirections, control operators, expansions.
_SHELL_SPECIAL_CHARS = frozenset(" \t\n[]<>|&;*?()$`\"'{}")


def _quote_for_shell(token: str) -> str:
    """Wrap *token* in double quotes when it contains shell-special characters."""
    if not any(ch in _SHELL_SPECIAL_CHARS for ch in token):
        return token
    escaped = token
    # Inside double quotes these three characters keep a special meaning.
    for ch in ('"', "$", "`"):
        escaped = escaped.replace(ch, "\\" + ch)
    return f'"{escaped}"'


def generate_uv_command(
    dependencies: list,
    input_path: str,
    python_version: Optional[str] = None,
    script_path: str = "scripts/lyxy_document_reader.py",
    include_pyarmor: bool = True
) -> str:
    """
    Build the ``uv run`` command line as a single display string.

    Args:
        dependencies: List of packages to add with ``--with``.
        input_path: Input file path or URL.
        python_version: Required Python version; ``None`` adds no ``--python``.
        script_path: Path of the script to run.
        include_pyarmor: Whether to add ``--with pyarmor``.

    Returns:
        The command string. Dependencies, the script path and the input path
        are double-quoted when they contain shell-special characters (for
        example ``unstructured[pdf]``, ``numpy<2`` or a path with spaces).
    """
    parts = ["PYTHONPATH=. uv run"]

    if python_version:
        parts.append(f"--python {python_version}")

    if include_pyarmor:
        parts.append("--with pyarmor")

    # Extras ("[...]") and version bounds ("<", ">") must be quoted, otherwise
    # the shell treats them as glob patterns or redirections.
    parts.extend(f"--with {_quote_for_shell(dep)}" for dep in dependencies)

    parts.append(f"{_quote_for_shell(script_path)} {_quote_for_shell(input_path)}")

    return " ".join(parts)


def generate_uv_args(
    dependencies: list,
    script_path: str,
    python_version: Optional[str] = None,
    include_pyarmor: bool = True
) -> list:
    """
    Build the ``uv run`` argument list for ``subprocess.run``.

    No quoting is applied: each list element is passed to the process as one
    argument.

    Args:
        dependencies: List of packages to add with ``--with``.
        script_path: Path of the script to run.
        python_version: Required Python version; ``None`` adds no ``--python``.
        include_pyarmor: Whether to add ``--with pyarmor``.

    Returns:
        The argument list, ending with ``script_path``.
    """
    args = ["uv", "run"]

    if python_version:
        args.extend(["--python", python_version])

    if include_pyarmor:
        args.extend(["--with", "pyarmor"])

    for dep in dependencies:
        args.extend(["--with", dep])

    args.append(script_path)

    return args


def generate_python_command(
    dependencies: list,
    input_path: str,
    script_path: str = "scripts/lyxy_document_reader.py",
    include_pyarmor: bool = True
) -> Tuple[str, str]:
    """
    Build the plain ``python`` command and the matching ``pip install`` command.

    Args:
        dependencies: List of packages to install.
        input_path: Input file path or URL.
        script_path: Path of the script to run.
        include_pyarmor: Whether to install pyarmor as well.

    Returns:
        A ``(python_command, pip_command)`` tuple of display strings. Tokens
        with shell-special characters are double-quoted, using the same
        rule as ``generate_uv_command``.
    """
    python_cmd = f"python {_quote_for_shell(script_path)} {_quote_for_shell(input_path)}"

    # pip receives the same requirement strings, quoted where necessary.
    pip_parts = ["pip install"]
    if include_pyarmor:
        pip_parts.append("pyarmor")
    pip_parts.extend(_quote_for_shell(dep) for dep in dependencies)
    pip_cmd = " ".join(pip_parts)

    return python_cmd, pip_cmd
|
||||||
|
|
||||||
|
|
||||||
|
def format_advice(
    file_type: str,
    input_path: str,
    platform_id: str,
    uv_command: str,
    python_command: str,
    pip_command: str,
    has_platform_specific: bool = False
) -> str:
    """
    Render the advice text shown to the user.

    Args:
        file_type: File type key; printed in upper case.
        input_path: Input path.
        platform_id: Platform identifier.
        uv_command: The uv command line.
        python_command: The python command line.
        pip_command: The pip install command line.
        has_platform_specific: Whether a platform-specific configuration was
            used; the platform line is printed only when this is true.

    Returns:
        The formatted advice, with lines joined by newlines.
    """
    # Header: file type and input path, plus the platform when relevant.
    header = [f"文件类型: {file_type.upper()}", f"输入路径: {input_path}"]
    if has_platform_specific:
        header.append(f"平台: {platform_id}")

    uv_section = ["[uv 命令]", uv_command]
    python_section = ["[python 命令]", python_command, pip_command]

    # The sections are separated by single empty lines.
    return "\n".join([*header, "", *uv_section, "", *python_section])
|
||||||
|
|
||||||
|
|
||||||
|
def generate_advice(
    input_path: str,
    readers: List[BaseReader],
    script_path: str = "scripts/lyxy_document_reader.py"
) -> Optional[str]:
    """
    Produce the complete execution advice for an input.

    Args:
        input_path: Input file path or URL.
        readers: Already instantiated readers.
        script_path: Path of the script to run.

    Returns:
        The formatted advice text, or ``None`` when no reader recognises the
        input.
    """
    # Detect the file type by asking the readers.
    reader_cls = detect_file_type_light(input_path, readers)
    if not reader_cls:
        return None

    # The configuration key doubles as the displayed file type.
    key = _READER_KEY_MAP.get(reader_cls, "unknown")

    platform_id = get_platform()
    python_version, dependencies = get_dependencies(reader_cls, platform_id)

    # Flag the platform only when the type has both an entry for this
    # platform and a "default" entry.
    has_platform_specific = (
        key in DEPENDENCIES
        and platform_id in DEPENDENCIES[key]
        and "default" in DEPENDENCIES[key]
    )

    uv_command = generate_uv_command(
        dependencies, input_path, python_version=python_version, script_path=script_path
    )
    python_command, pip_command = generate_python_command(
        dependencies, input_path, script_path=script_path
    )

    return format_advice(
        file_type=key,
        input_path=input_path,
        platform_id=platform_id,
        uv_command=uv_command,
        python_command=python_command,
        pip_command=pip_command,
        has_platform_specific=has_platform_specific,
    )
|
||||||
@@ -1,94 +1,11 @@
|
|||||||
"""Markdown 后处理模块,包含所有格式共享的工具函数。"""
|
"""Markdown 后处理模块,包含 Markdown 格式化的工具函数。"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import zipfile
|
from typing import List, Optional
|
||||||
from pathlib import Path
|
|
||||||
from typing import List, Optional, Tuple
|
|
||||||
|
|
||||||
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
|
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
|
||||||
_CONSECUTIVE_BLANK_LINES = re.compile(r"\n{3,}")
|
_CONSECUTIVE_BLANK_LINES = re.compile(r"\n{3,}")
|
||||||
|
|
||||||
# unstructured 噪声匹配: pptx 中的 RGB 颜色值(如 "R:255 G:128 B:0")
|
|
||||||
_RGB_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$")
|
|
||||||
# unstructured 噪声匹配: 破折号页码(如 "— 3 —")
|
|
||||||
_PAGE_NUMBER_PATTERN = re.compile(r"^—\s*\d+\s*—$")
|
|
||||||
|
|
||||||
|
|
||||||
def parse_with_markitdown(
|
|
||||||
file_path: str,
|
|
||||||
) -> Tuple[Optional[str], Optional[str]]:
|
|
||||||
"""使用 MarkItDown 库解析文件"""
|
|
||||||
try:
|
|
||||||
from markitdown import MarkItDown
|
|
||||||
|
|
||||||
md = MarkItDown()
|
|
||||||
result = md.convert(file_path)
|
|
||||||
if not result.text_content.strip():
|
|
||||||
return None, "文档为空"
|
|
||||||
return result.text_content, None
|
|
||||||
except ImportError:
|
|
||||||
return None, "MarkItDown 库未安装"
|
|
||||||
except Exception as e:
|
|
||||||
return None, f"MarkItDown 解析失败: {str(e)}"
|
|
||||||
|
|
||||||
|
|
||||||
def parse_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|
||||||
"""使用 docling 库解析文件"""
|
|
||||||
try:
|
|
||||||
from docling.document_converter import DocumentConverter
|
|
||||||
except ImportError:
|
|
||||||
return None, "docling 库未安装"
|
|
||||||
|
|
||||||
try:
|
|
||||||
converter = DocumentConverter()
|
|
||||||
result = converter.convert(file_path)
|
|
||||||
markdown_content = result.document.export_to_markdown()
|
|
||||||
if not markdown_content.strip():
|
|
||||||
return None, "文档为空"
|
|
||||||
return markdown_content, None
|
|
||||||
except Exception as e:
|
|
||||||
return None, f"docling 解析失败: {str(e)}"
|
|
||||||
|
|
||||||
|
|
||||||
def build_markdown_table(rows_data: List[List[str]]) -> str:
|
|
||||||
"""将二维列表转换为 Markdown 表格格式"""
|
|
||||||
if not rows_data or not rows_data[0]:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
md_lines = []
|
|
||||||
for i, row_data in enumerate(rows_data):
|
|
||||||
row_text = [cell if cell else "" for cell in row_data]
|
|
||||||
md_lines.append("| " + " | ".join(row_text) + " |")
|
|
||||||
if i == 0:
|
|
||||||
md_lines.append("| " + " | ".join(["---"] * len(row_text)) + " |")
|
|
||||||
return "\n".join(md_lines) + "\n\n"
|
|
||||||
|
|
||||||
|
|
||||||
def flush_list_stack(list_stack: List[str], target: List[str]) -> None:
|
|
||||||
"""将列表堆栈中的非空项添加到目标列表并清空堆栈"""
|
|
||||||
for item in list_stack:
|
|
||||||
if item:
|
|
||||||
target.append(item + "\n")
|
|
||||||
list_stack.clear()
|
|
||||||
|
|
||||||
|
|
||||||
def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipExtFile]:
|
|
||||||
"""安全地从 ZipFile 中打开文件,防止路径遍历攻击"""
|
|
||||||
if not name:
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
|
||||||
normalized = Path(name).as_posix()
|
|
||||||
# 检查是否包含父目录引用
|
|
||||||
if ".." in Path(normalized).parts:
|
|
||||||
return None
|
|
||||||
# 检查是否为绝对路径
|
|
||||||
if Path(normalized).is_absolute():
|
|
||||||
return None
|
|
||||||
return zip_file.open(name)
|
|
||||||
except (ValueError, OSError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_markdown_whitespace(content: str) -> str:
|
def normalize_markdown_whitespace(content: str) -> str:
|
||||||
"""规范化 Markdown 空白字符,保留单行空行"""
|
"""规范化 Markdown 空白字符,保留单行空行"""
|
||||||
@@ -235,56 +152,3 @@ def search_markdown(
|
|||||||
results.append("\n".join(result_lines))
|
results.append("\n".join(result_lines))
|
||||||
|
|
||||||
return "\n---\n".join(results)
|
return "\n---\n".join(results)
|
||||||
|
|
||||||
|
|
||||||
def _unstructured_elements_to_markdown(
|
|
||||||
elements: list, trust_titles: bool = True
|
|
||||||
) -> str:
|
|
||||||
"""将 unstructured 解析出的元素列表转换为 Markdown 文本"""
|
|
||||||
try:
|
|
||||||
import markdownify as md_lib
|
|
||||||
from unstructured.documents.elements import (
|
|
||||||
Footer,
|
|
||||||
Header,
|
|
||||||
Image,
|
|
||||||
ListItem,
|
|
||||||
PageBreak,
|
|
||||||
PageNumber,
|
|
||||||
Table,
|
|
||||||
Title,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
return "\n\n".join(
|
|
||||||
el.text for el in elements if hasattr(el, "text") and el.text and el.text.strip()
|
|
||||||
)
|
|
||||||
|
|
||||||
skip_types = (Header, Footer, PageBreak, PageNumber)
|
|
||||||
parts = []
|
|
||||||
|
|
||||||
for el in elements:
|
|
||||||
if isinstance(el, skip_types):
|
|
||||||
continue
|
|
||||||
text = el.text.strip() if hasattr(el, "text") else str(el).strip()
|
|
||||||
if not text or _RGB_PATTERN.match(text) or _PAGE_NUMBER_PATTERN.match(text):
|
|
||||||
continue
|
|
||||||
|
|
||||||
if isinstance(el, Table):
|
|
||||||
html = getattr(el.metadata, "text_as_html", None)
|
|
||||||
if html:
|
|
||||||
parts.append(md_lib.markdownify(html, strip=["img"]).strip())
|
|
||||||
else:
|
|
||||||
parts.append(str(el))
|
|
||||||
elif isinstance(el, Title) and trust_titles:
|
|
||||||
depth = getattr(el.metadata, "category_depth", None) or 1
|
|
||||||
depth = min(max(depth, 1), 4)
|
|
||||||
parts.append(f"{'#' * depth} {text}")
|
|
||||||
elif isinstance(el, ListItem):
|
|
||||||
parts.append(f"- {text}")
|
|
||||||
elif isinstance(el, Image):
|
|
||||||
path = getattr(el.metadata, "image_path", None) or ""
|
|
||||||
if path:
|
|
||||||
parts.append(f"")
|
|
||||||
else:
|
|
||||||
parts.append(text)
|
|
||||||
|
|
||||||
return "\n\n".join(parts)
|
|
||||||
|
|||||||
@@ -4,12 +4,12 @@ import argparse
|
|||||||
import sys
|
import sys
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core.exceptions import FileDetectionError, ReaderNotFoundError
|
from core.exceptions import FileDetectionError, ReaderNotFoundError
|
||||||
from scripts.core.markdown import (
|
from core.markdown import (
|
||||||
normalize_markdown_whitespace,
|
normalize_markdown_whitespace,
|
||||||
remove_markdown_images,
|
remove_markdown_images,
|
||||||
)
|
)
|
||||||
from scripts.readers import BaseReader
|
from readers import BaseReader
|
||||||
|
|
||||||
|
|
||||||
def parse_input(
|
def parse_input(
|
||||||
|
|||||||
@@ -1,42 +1,36 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""文档解析器命令行交互模块,提供命令行接口。支持 DOCX、PPTX、XLSX、PDF、HTML 和 URL。"""
|
"""文档解析器入口 - 环境检测和自启动"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
from pathlib import Path
|
||||||
|
|
||||||
# 抑制第三方库的进度条和日志,仅保留解析结果输出
|
# 确定项目根目录和脚本路径
|
||||||
|
script_file = Path(__file__).resolve()
|
||||||
|
scripts_dir = script_file.parent
|
||||||
|
project_root = scripts_dir.parent
|
||||||
|
bootstrap_path = str(scripts_dir / "bootstrap.py")
|
||||||
|
|
||||||
|
# 将 scripts/ 目录添加到 sys.path
|
||||||
|
if str(scripts_dir) not in sys.path:
|
||||||
|
sys.path.append(str(scripts_dir))
|
||||||
|
|
||||||
|
# 抑制第三方库日志
|
||||||
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
||||||
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
||||||
os.environ["TQDM_DISABLE"] = "1"
|
os.environ["TQDM_DISABLE"] = "1"
|
||||||
warnings.filterwarnings("ignore")
|
|
||||||
|
|
||||||
# 配置日志系统,只输出 ERROR 级别
|
|
||||||
logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')
|
|
||||||
|
|
||||||
# 设置第三方库日志等级
|
|
||||||
logging.getLogger('docling').setLevel(logging.ERROR)
|
|
||||||
logging.getLogger('unstructured').setLevel(logging.ERROR)
|
|
||||||
|
|
||||||
from scripts.core import (
|
|
||||||
FileDetectionError,
|
|
||||||
ReaderNotFoundError,
|
|
||||||
output_result,
|
|
||||||
parse_input,
|
|
||||||
process_content,
|
|
||||||
)
|
|
||||||
from scripts.readers import READERS
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main():
|
||||||
|
"""主函数:环境检测和决策"""
|
||||||
|
# 解析命令行参数(轻量,仅识别必要参数)
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="将 DOCX、PPTX、XLSX、PDF、HTML 文件或 URL 解析为 Markdown"
|
description="将 DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL 解析为 Markdown"
|
||||||
)
|
)
|
||||||
|
parser.add_argument("input_path", help="DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL")
|
||||||
parser.add_argument("input_path", help="DOCX、PPTX、XLSX、PDF、HTML 文件或 URL")
|
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-n",
|
"-n",
|
||||||
"--context",
|
"--context",
|
||||||
@@ -44,7 +38,6 @@ def main() -> None:
|
|||||||
default=2,
|
default=2,
|
||||||
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
|
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
|
||||||
)
|
)
|
||||||
|
|
||||||
group = parser.add_mutually_exclusive_group()
|
group = parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
|
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
|
||||||
@@ -71,29 +64,58 @@ def main() -> None:
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# 实例化所有 readers
|
# 检测 uv 是否可用
|
||||||
|
uv_path = shutil.which("uv")
|
||||||
|
|
||||||
|
if not uv_path:
|
||||||
|
# uv 不可用,降级为直接执行 bootstrap.py
|
||||||
|
import bootstrap
|
||||||
|
bootstrap.run_normal(args)
|
||||||
|
return
|
||||||
|
|
||||||
|
# uv 可用,需要自启动
|
||||||
|
# 导入依赖检测模块
|
||||||
|
from config import DEPENDENCIES
|
||||||
|
from core.advice_generator import (
|
||||||
|
detect_file_type_light,
|
||||||
|
get_platform,
|
||||||
|
get_dependencies,
|
||||||
|
generate_uv_args,
|
||||||
|
)
|
||||||
|
from readers import READERS
|
||||||
|
|
||||||
|
# 检测文件类型
|
||||||
readers = [ReaderCls() for ReaderCls in READERS]
|
readers = [ReaderCls() for ReaderCls in READERS]
|
||||||
|
reader_cls = detect_file_type_light(args.input_path, readers)
|
||||||
|
|
||||||
try:
|
if not reader_cls:
|
||||||
content, failures = parse_input(args.input_path, readers)
|
# 无法识别文件类型,降级执行让它报错
|
||||||
except FileDetectionError as e:
|
import bootstrap
|
||||||
print(f"错误: {e}")
|
bootstrap.run_normal(args)
|
||||||
sys.exit(1)
|
return
|
||||||
except ReaderNotFoundError as e:
|
|
||||||
print(f"错误: {e}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if content is None:
|
# 获取平台和依赖配置
|
||||||
print("所有解析方法均失败:")
|
platform_id = get_platform()
|
||||||
for failure in failures:
|
python_version, dependencies = get_dependencies(reader_cls, platform_id)
|
||||||
print(failure)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# 处理内容
|
# 生成 uv 命令参数列表
|
||||||
content = process_content(content)
|
uv_args = generate_uv_args(
|
||||||
|
dependencies=dependencies,
|
||||||
|
script_path=bootstrap_path,
|
||||||
|
python_version=python_version,
|
||||||
|
include_pyarmor=True
|
||||||
|
)
|
||||||
|
|
||||||
# 输出结果
|
# 添加所有命令行参数
|
||||||
output_result(content, args)
|
uv_args.extend(sys.argv[1:])
|
||||||
|
|
||||||
|
# 设置环境变量
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["PYTHONPATH"] = str(project_root)
|
||||||
|
|
||||||
|
# 自启动:使用 subprocess 替代 execvpe(Windows 兼容)
|
||||||
|
result = subprocess.run(uv_args, env=env, cwd=str(project_root))
|
||||||
|
sys.exit(result.returncode)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -2,25 +2,34 @@
|
|||||||
|
|
||||||
from .base import BaseReader
|
from .base import BaseReader
|
||||||
from .docx import DocxReader
|
from .docx import DocxReader
|
||||||
|
from .doc import DocReader
|
||||||
from .xlsx import XlsxReader
|
from .xlsx import XlsxReader
|
||||||
from .pptx import PptxReader
|
from .pptx import PptxReader
|
||||||
from .pdf import PdfReader
|
from .pdf import PdfReader
|
||||||
from .html import HtmlReader
|
from .html import HtmlReader
|
||||||
|
from .xls import XlsReader
|
||||||
|
from .ppt import PptReader
|
||||||
|
|
||||||
READERS = [
|
READERS = [
|
||||||
DocxReader,
|
DocxReader,
|
||||||
|
DocReader,
|
||||||
XlsxReader,
|
XlsxReader,
|
||||||
PptxReader,
|
PptxReader,
|
||||||
PdfReader,
|
PdfReader,
|
||||||
HtmlReader,
|
HtmlReader,
|
||||||
|
XlsReader,
|
||||||
|
PptReader,
|
||||||
]
|
]
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"BaseReader",
|
"BaseReader",
|
||||||
"DocxReader",
|
"DocxReader",
|
||||||
|
"DocReader",
|
||||||
"XlsxReader",
|
"XlsxReader",
|
||||||
"PptxReader",
|
"PptxReader",
|
||||||
"PdfReader",
|
"PdfReader",
|
||||||
"HtmlReader",
|
"HtmlReader",
|
||||||
|
"XlsReader",
|
||||||
|
"PptReader",
|
||||||
"READERS",
|
"READERS",
|
||||||
]
|
]
|
||||||
|
|||||||
310
scripts/readers/_utils.py
Normal file
310
scripts/readers/_utils.py
Normal file
@@ -0,0 +1,310 @@
|
|||||||
|
"""Reader 内部共享工具模块。
|
||||||
|
|
||||||
|
此模块包含各 reader 实现共享的内部工具函数,仅供 readers 包内部使用。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# 通用解析器包装函数
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def parse_via_markitdown(
    file_path: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Parse a file into Markdown with the MarkItDown library.

    Args:
        file_path: Path of the file to convert.

    Returns:
        ``(markdown_content, error_message)``: ``(content, None)`` on success,
        ``(None, error)`` on failure.
    """
    # Guard only the import, so that an ImportError raised while converting
    # is reported as a parse failure rather than "library not installed".
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    try:
        result = MarkItDown().convert(file_path)
        # A missing text payload is treated the same as an empty document.
        text = result.text_content or ""
        if not text.strip():
            return None, "文档为空"
        return text, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_via_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a file into Markdown with the docling library.

    Args:
        file_path: Path of the file to convert.

    Returns:
        ``(markdown_content, error_message)``: ``(content, None)`` on success,
        ``(None, error)`` on failure.
    """
    try:
        from docling.document_converter import DocumentConverter
    except ImportError:
        return None, "docling 库未安装"

    try:
        conversion = DocumentConverter().convert(file_path)
        rendered = conversion.document.export_to_markdown()
        # Evaluated inside the try so a non-string export is still reported
        # as a parse failure.
        is_blank = not rendered.strip()
    except Exception as exc:
        return None, f"docling 解析失败: {exc!s}"

    if is_blank:
        return None, "文档为空"
    return rendered, None
|
||||||
|
|
||||||
|
|
||||||
|
def convert_via_libreoffice(
    input_path: str,
    target_format: str,
    output_dir: Path,
    output_suffix: Optional[str] = None,
    timeout: int = 60
) -> Tuple[Optional[Path], Optional[str]]:
    """Convert a file to another format with the LibreOffice ``soffice`` CLI.

    Args:
        input_path: Path of the file to convert.
        target_format: Value passed to ``--convert-to`` (e.g. "md", "pptx").
        output_dir: Directory passed to ``--outdir``; the caller owns its
            lifetime.
        output_suffix: Optional suffix of the produced file; defaults to
            ``target_format`` when not given.
        timeout: Maximum time in seconds to wait for ``soffice``.

    Returns:
        ``(output_path, error_message)``: ``(Path, None)`` on success,
        ``(None, error)`` on failure.
    """
    # soffice must be discoverable on PATH.
    soffice_path = shutil.which("soffice")
    if not soffice_path:
        return None, "LibreOffice 未安装"

    input_file = Path(input_path)
    output_dir = Path(output_dir)  # tolerate callers that pass a plain string
    suffix = output_suffix if output_suffix else target_format
    expected_output = output_dir / (input_file.stem + "." + suffix)

    cmd = [
        soffice_path,
        "--headless",
        "--convert-to", target_format,
        "--outdir", str(output_dir),
        str(input_file)
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Never let undecodable tool output abort the conversion.
            errors="replace",
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return None, f"LibreOffice 转换超时 ({timeout}秒)"
    except OSError as exc:
        # The executable vanished or is not runnable: keep the tuple contract
        # instead of letting the exception escape.
        return None, f"LibreOffice 启动失败: {exc}"

    if result.returncode != 0:
        message = f"LibreOffice 转换失败 (code: {result.returncode})"
        detail = (result.stderr or "").strip()
        if detail:
            # Surface the tool's own diagnostics to make failures actionable.
            message += f": {detail}"
        return None, message

    if expected_output.exists():
        return expected_output, None

    # Fallback: take any file with the expected suffix; sorted so that the
    # pick is deterministic when several candidates exist.
    candidates = sorted(output_dir.glob("*." + suffix))
    if not candidates:
        return None, "LibreOffice 未生成输出文件"
    return candidates[0], None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_via_libreoffice(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a file to Markdown through the LibreOffice ``soffice`` CLI.

    The conversion output is written to a temporary directory that is removed
    before this function returns.

    Args:
        file_path: Path of the file to convert.

    Returns:
        ``(markdown_content, error_message)``: ``(content, None)`` on success,
        ``(None, error)`` on failure.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        converted, failure = convert_via_libreoffice(
            input_path=file_path,
            target_format="md",
            output_dir=Path(scratch_dir),
            timeout=60,
        )
        if failure:
            return None, failure

        # Read while the temporary directory still exists.
        markdown = converted.read_text(encoding="utf-8", errors="replace").strip()
        if markdown:
            return markdown, None
        return None, "LibreOffice 输出为空"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# 格式化工具
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def build_markdown_table(rows_data: List[List[str]]) -> str:
    """Format a two-dimensional list as a Markdown table.

    Cell content is sanitised so it cannot break the table layout: ``None``
    becomes an empty cell, other values are converted with ``str``, line
    breaks become ``<br>`` and unescaped ``|`` characters are escaped. Rows
    shorter than the header are padded with empty cells.

    Args:
        rows_data: Rows of cells; the first row is the header.

    Returns:
        The Markdown table followed by a blank line, or an empty string when
        there are no rows or the header row is empty.
    """
    if not rows_data or not rows_data[0]:
        return ""

    def _render_cell(cell) -> str:
        if cell is None:
            return ""
        # A raw newline would end the table row early.
        text = "<br>".join(str(cell).splitlines())
        # A bare pipe would be read as a column separator.
        return re.sub(r"(?<!\\)\|", r"\\|", text)

    column_count = len(rows_data[0])
    md_lines = []
    for i, row_data in enumerate(rows_data):
        row_text = [_render_cell(cell) for cell in row_data]
        # Pad short rows so every row has at least the header's width.
        row_text.extend([""] * (column_count - len(row_text)))
        md_lines.append("| " + " | ".join(row_text) + " |")
        if i == 0:
            md_lines.append("| " + " | ".join(["---"] * column_count) + " |")
    return "\n".join(md_lines) + "\n\n"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# 列表处理工具
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def flush_list_stack(list_stack: List[str], target: List[str]) -> None:
    """Move the non-empty entries of a list stack into ``target``.

    Each non-empty entry is appended to ``target`` with a trailing newline,
    in stack order; afterwards the stack is emptied in place.

    Args:
        list_stack: Pending list entries; cleared by this call.
        target: List that receives the flushed entries.
    """
    target.extend(entry + "\n" for entry in list_stack if entry)
    list_stack.clear()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# ZIP 文件安全处理
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipExtFile]:
    """Open a member of a ZipFile, refusing names that look like path traversal.

    The name is validated independently of the host operating system: both
    ``/`` and ``\\`` count as separators, and POSIX-absolute as well as
    Windows drive-qualified names are rejected.

    Args:
        zip_file: An open ZipFile object.
        name: Member name to open.

    Returns:
        The opened ZipExtFile, or None when the name is empty or unsafe, or
        when opening raises ValueError, OSError or KeyError.
    """
    from pathlib import PurePosixPath, PureWindowsPath

    if not name:
        return None

    # Treat backslashes as separators too, so "..\\x" is caught on POSIX.
    normalized = name.replace("\\", "/")
    # Absolute on either platform: leading separator or a Windows drive.
    if normalized.startswith("/") or PureWindowsPath(name).drive:
        return None
    # Any parent-directory reference is refused.
    if ".." in PurePosixPath(normalized).parts:
        return None

    try:
        return zip_file.open(name)
    except (ValueError, OSError, KeyError):
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# unstructured 库相关
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# unstructured 噪声匹配模式
|
||||||
|
# Text that consists solely of an "R:<n> G:<n> B:<n>" colour triple.
_UNSTRUCTURED_RGB_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$")
# Text that consists solely of a number framed by em dashes, e.g. "— 3 —".
_UNSTRUCTURED_PAGE_NUMBER_PATTERN = re.compile(r"^—\s*\d+\s*—$")


def convert_unstructured_to_markdown(
    elements: list, trust_titles: bool = True
) -> str:
    """Convert a list of unstructured elements into Markdown text.

    Header, Footer, PageBreak and PageNumber elements are dropped, as are
    elements whose text is empty or matches one of the noise patterns above.
    Tables are rendered from their HTML form when available, titles become
    ``#`` headings (1 to 4 levels), list items become ``-`` bullets, and any
    other element contributes its stripped text.

    When ``markdownify`` or ``unstructured`` cannot be imported, the non-blank
    ``text`` attributes of the elements are joined with blank lines instead.

    Args:
        elements: Elements produced by an unstructured partition function.
        trust_titles: Whether Title elements are rendered as headings; when
            False they are emitted as plain text.

    Returns:
        The Markdown text, with parts separated by blank lines.
    """
    try:
        import markdownify as md_lib
        from unstructured.documents.elements import (
            Footer,
            Header,
            Image,
            ListItem,
            PageBreak,
            PageNumber,
            Table,
            Title,
        )
    except ImportError:
        return "\n\n".join(
            el.text for el in elements if hasattr(el, "text") and el.text and el.text.strip()
        )

    skip_types = (Header, Footer, PageBreak, PageNumber)
    parts = []

    for el in elements:
        if isinstance(el, skip_types):
            continue
        raw = el.text if hasattr(el, "text") else str(el)
        # A text attribute of None is treated as empty instead of crashing.
        text = (raw or "").strip()
        if not text or _UNSTRUCTURED_RGB_PATTERN.match(text) or _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match(text):
            continue

        if isinstance(el, Table):
            html = getattr(el.metadata, "text_as_html", None)
            if html:
                parts.append(md_lib.markdownify(html, strip=["img"]).strip())
            else:
                parts.append(str(el))
        elif isinstance(el, Title) and trust_titles:
            depth = getattr(el.metadata, "category_depth", None) or 1
            depth = min(max(depth, 1), 4)
            parts.append(f"{'#' * depth} {text}")
        elif isinstance(el, ListItem):
            parts.append(f"- {text}")
        elif isinstance(el, Image):
            path = getattr(el.metadata, "image_path", None) or ""
            if path:
                # Emit a Markdown image reference rather than an empty part.
                # NOTE(review): the alt text "image" is a placeholder choice;
                # confirm against the intended output format.
                parts.append(f"![image]({path})")
        else:
            parts.append(text)

    return "\n\n".join(parts)
|
||||||
46
scripts/readers/doc/__init__.py
Normal file
46
scripts/readers/doc/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""DOC 文件阅读器,使用 LibreOffice 解析。"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from readers.base import BaseReader
|
||||||
|
from utils import is_valid_doc
|
||||||
|
|
||||||
|
from . import libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
PARSERS = [
|
||||||
|
("LibreOffice", libreoffice.parse),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class DocReader(BaseReader):
    """Reader for DOC files."""

    def supports(self, file_path: str) -> bool:
        """Return True when ``file_path`` ends with ``.doc`` (case-insensitive)."""
        return file_path.lower().endswith('.doc')

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse a DOC file by trying each entry of ``PARSERS`` in order.

        Args:
            file_path: Path of the DOC file.

        Returns:
            ``(content, failures)``: the first parser result that is not None
            together with the failures collected so far, or ``(None,
            failures)`` when the file is missing, is not a valid DOC file, or
            every parser failed.
        """
        if not os.path.exists(file_path):
            return None, ["文件不存在"]
        if not is_valid_doc(file_path):
            return None, ["不是有效的 DOC 文件"]

        problems: List[str] = []
        for label, run_parser in PARSERS:
            try:
                markdown, reason = run_parser(file_path)
            except Exception as exc:
                # A crashing parser must not stop the remaining ones.
                problems.append(f"- {label}: [意外异常] {type(exc).__name__}: {exc!s}")
                continue
            if markdown is not None:
                return markdown, problems
            problems.append(f"- {label}: {reason}")

        return None, problems
|
||||||
9
scripts/readers/doc/libreoffice.py
Normal file
9
scripts/readers/doc/libreoffice.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""使用 LibreOffice soffice 命令行解析 DOC 文件"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
from readers._utils import parse_via_libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOC file by delegating to the shared LibreOffice helper.

    Returns whatever ``parse_via_libreoffice`` returns for ``file_path``.
    """
    outcome = parse_via_libreoffice(file_path)
    return outcome
|
||||||
@@ -3,13 +3,14 @@
|
|||||||
import os
|
import os
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.readers.base import BaseReader
|
from readers.base import BaseReader
|
||||||
from scripts.utils import is_valid_docx
|
from utils import is_valid_docx
|
||||||
|
|
||||||
from . import docling
|
from . import docling
|
||||||
from . import unstructured
|
from . import unstructured
|
||||||
from . import markitdown
|
from . import markitdown
|
||||||
from . import pypandoc
|
from . import pypandoc
|
||||||
|
from . import libreoffice
|
||||||
from . import python_docx
|
from . import python_docx
|
||||||
from . import native_xml
|
from . import native_xml
|
||||||
|
|
||||||
@@ -19,6 +20,7 @@ PARSERS = [
|
|||||||
("unstructured", unstructured.parse),
|
("unstructured", unstructured.parse),
|
||||||
("pypandoc-binary", pypandoc.parse),
|
("pypandoc-binary", pypandoc.parse),
|
||||||
("MarkItDown", markitdown.parse),
|
("MarkItDown", markitdown.parse),
|
||||||
|
("LibreOffice", libreoffice.parse),
|
||||||
("python-docx", python_docx.parse),
|
("python-docx", python_docx.parse),
|
||||||
("XML 原生解析", native_xml.parse),
|
("XML 原生解析", native_xml.parse),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_docling
|
from readers._utils import parse_via_docling
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 docling 库解析 DOCX 文件"""
|
"""使用 docling 库解析 DOCX 文件"""
|
||||||
return parse_with_docling(file_path)
|
return parse_via_docling(file_path)
|
||||||
|
|||||||
9
scripts/readers/docx/libreoffice.py
Normal file
9
scripts/readers/docx/libreoffice.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""使用 LibreOffice soffice 命令行解析 DOCX 文件"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
from readers._utils import parse_via_libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file by delegating to the shared LibreOffice helper.

    Returns whatever ``parse_via_libreoffice`` returns for ``file_path``.
    """
    outcome = parse_via_libreoffice(file_path)
    return outcome
|
||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_markitdown
|
from readers._utils import parse_via_markitdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 MarkItDown 库解析 DOCX 文件"""
|
"""使用 MarkItDown 库解析 DOCX 文件"""
|
||||||
return parse_with_markitdown(file_path)
|
return parse_via_markitdown(file_path)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
|||||||
import zipfile
|
import zipfile
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import build_markdown_table, safe_open_zip
|
from readers._utils import build_markdown_table, safe_open_zip
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Any, List, Optional, Tuple
|
from typing import Any, List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import build_markdown_table
|
from readers._utils import build_markdown_table
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import _unstructured_elements_to_markdown
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
@@ -14,7 +14,7 @@ def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
elements = partition_docx(filename=file_path, infer_table_structure=True)
|
elements = partition_docx(filename=file_path, infer_table_structure=True)
|
||||||
content = _unstructured_elements_to_markdown(elements)
|
content = convert_unstructured_to_markdown(elements)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
return None, "文档为空"
|
return None, "文档为空"
|
||||||
return content, None
|
return content, None
|
||||||
|
|||||||
@@ -4,12 +4,12 @@ import os
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.readers.base import BaseReader
|
from readers.base import BaseReader
|
||||||
from scripts.utils import is_url
|
from utils import is_url
|
||||||
from scripts.utils import encoding_detection
|
from utils import encoding_detection
|
||||||
|
|
||||||
from . import cleaner
|
from . import cleaner
|
||||||
from . import downloader
|
from .downloader import download_html
|
||||||
from . import trafilatura
|
from . import trafilatura
|
||||||
from . import domscribe
|
from . import domscribe
|
||||||
from . import markitdown
|
from . import markitdown
|
||||||
@@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
|
|||||||
# 步骤 1: 获取 HTML 内容
|
# 步骤 1: 获取 HTML 内容
|
||||||
if is_url(file_path):
|
if is_url(file_path):
|
||||||
# URL 路径: 下载 HTML
|
# URL 路径: 下载 HTML
|
||||||
html_content, download_failures = downloader.download_html(file_path)
|
html_content, download_failures = download_html(file_path)
|
||||||
all_failures.extend(download_failures)
|
all_failures.extend(download_failures)
|
||||||
if html_content is None:
|
if html_content is None:
|
||||||
return None, all_failures
|
return None, all_failures
|
||||||
|
|||||||
@@ -3,8 +3,6 @@
|
|||||||
import re
|
import re
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
|
|
||||||
def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
|
def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,262 +0,0 @@
|
|||||||
"""URL 下载模块,按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import asyncio
|
|
||||||
import tempfile
|
|
||||||
import urllib.request
|
|
||||||
import urllib.error
|
|
||||||
from typing import Optional, Tuple
|
|
||||||
|
|
||||||
|
|
||||||
# 公共配置
|
|
||||||
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
||||||
WINDOW_SIZE = "1920,1080"
|
|
||||||
LANGUAGE_SETTING = "zh-CN,zh"
|
|
||||||
|
|
||||||
# Chrome 浏览器启动参数(pyppeteer 和 selenium 共用)
|
|
||||||
CHROME_ARGS = [
|
|
||||||
"--no-sandbox",
|
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--disable-gpu",
|
|
||||||
"--disable-software-rasterizer",
|
|
||||||
"--disable-extensions",
|
|
||||||
"--disable-background-networking",
|
|
||||||
"--disable-default-apps",
|
|
||||||
"--disable-sync",
|
|
||||||
"--disable-translate",
|
|
||||||
"--hide-scrollbars",
|
|
||||||
"--metrics-recording-only",
|
|
||||||
"--mute-audio",
|
|
||||||
"--no-first-run",
|
|
||||||
"--safebrowsing-disable-auto-update",
|
|
||||||
"--blink-settings=imagesEnabled=false",
|
|
||||||
"--disable-plugins",
|
|
||||||
"--disable-ipc-flooding-protection",
|
|
||||||
"--disable-renderer-backgrounding",
|
|
||||||
"--disable-background-timer-throttling",
|
|
||||||
"--disable-hang-monitor",
|
|
||||||
"--disable-prompt-on-repost",
|
|
||||||
"--disable-client-side-phishing-detection",
|
|
||||||
"--disable-component-update",
|
|
||||||
"--disable-domain-reliability",
|
|
||||||
"--disable-features=site-per-process",
|
|
||||||
"--disable-features=IsolateOrigins",
|
|
||||||
"--disable-features=VizDisplayCompositor",
|
|
||||||
"--disable-features=WebRTC",
|
|
||||||
f"--window-size={WINDOW_SIZE}",
|
|
||||||
f"--lang={LANGUAGE_SETTING}",
|
|
||||||
f"--user-agent={USER_AGENT}",
|
|
||||||
]
|
|
||||||
|
|
||||||
# 隐藏自动化特征的脚本(pyppeteer 和 selenium 共用)
|
|
||||||
HIDE_AUTOMATION_SCRIPT = """
|
|
||||||
() => {
|
|
||||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
||||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
|
||||||
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pyppeteer 额外的隐藏自动化脚本(包含 notifications 处理)
|
|
||||||
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
|
|
||||||
() => {
|
|
||||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
||||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
|
||||||
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
|
|
||||||
const originalQuery = window.navigator.permissions.query;
|
|
||||||
window.navigator.permissions.query = (parameters) => (
|
|
||||||
parameters.name === 'notifications' ?
|
|
||||||
Promise.resolve({ state: Notification.permission }) :
|
|
||||||
originalQuery(parameters)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with pyppeteer (headless browser, JavaScript rendered).

    Args:
        url: Address of the page to download.

    Returns:
        ``(content, error_message)``: ``(html, None)`` on success,
        ``(None, error)`` on failure.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # pyppeteer reads PYPPETEER_HOME when the package is imported, so it
        # has to be exported before the import below to have any effect.
        os.environ["PYPPETEER_HOME"] = os.path.join(tempfile.gettempdir(), "pyppeteer_home")

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    # Only hand an explicit binary to pyppeteer when it actually exists.
    executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None

    async def _download():
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS
            )
            page = await browser.newPage()

            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)

            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failing close must not mask the
                    # download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"pyppeteer 下载失败: {str(e)}"
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with selenium driving Chrome (JavaScript rendered).

    The driver and browser binaries are taken from the
    ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` environment
    variables; both must point at existing files.

    Args:
        url: Address of the page to download.

    Returns:
        ``(content, error_message)``: ``(html, None)`` on success,
        ``(None, error)`` on failure.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    if not (driver_path and os.path.exists(driver_path)):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not (binary_path and os.path.exists(binary_path)):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    options = Options()
    options.binary_location = binary_path
    for flag in ["--headless=new", *CHROME_ARGS]:
        options.add_argument(flag)

    # Drop the switches that advertise browser automation.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    browser = None
    try:
        import time

        browser = webdriver.Chrome(service=Service(driver_path), options=options)

        # Install the hiding script so it runs on every new document.
        browser.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": HIDE_AUTOMATION_SCRIPT},
        )

        browser.get(url)

        WebDriverWait(browser, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Poll until the page source length is unchanged on two consecutive
        # checks, or 30 polls have been made.
        previous_size = 0
        unchanged_polls = 0
        for _ in range(30):
            size = len(browser.page_source)
            unchanged_polls = unchanged_polls + 1 if size == previous_size else 0
            if unchanged_polls >= 2:
                break
            previous_size = size
            time.sleep(0.5)

        html = browser.page_source
        if html and html.strip():
            return html, None
        return None, "下载内容为空"
    except Exception as exc:
        return None, f"selenium 下载失败: {exc!s}"
    finally:
        if browser is not None:
            try:
                browser.quit()
            except Exception:
                # Best-effort shutdown; the result above is already decided.
                pass
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with httpx (lightweight HTTP client, no JS rendering).

    Args:
        url: Address of the page to download.

    Returns:
        ``(content, error_message)``: ``(html, None)`` on success,
        ``(None, error)`` on failure, including any final status other
        than 200.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    headers = {
        "User-Agent": USER_AGENT
    }

    try:
        # httpx does not follow redirects unless asked to; without this a
        # plain 301/302 would be reported as a failed download.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers=headers)
            if response.status_code == 200:
                content = response.text
                if not content or not content.strip():
                    return None, "下载内容为空"
                return content, None
            return None, f"HTTP {response.status_code}"
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with the standard-library urllib (last-resort fallback).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the decoded body and
        ``error`` is ``None``; on failure ``content`` is ``None`` and ``error``
        describes the reason (non-200 status, empty body, or any exception
        raised while fetching or decoding).
    """
    headers = {"User-Agent": USER_AGENT}

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"
            raw = response.read()
            # Honour the charset announced in the Content-Type header instead
            # of assuming UTF-8; fall back to UTF-8 when none is declared.
            charset = response.headers.get_content_charset() or "utf-8"

        try:
            content = raw.decode(charset)
        except LookupError:
            # The server announced an encoding Python does not know.
            content = raw.decode("utf-8")

        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"
|
|
||||||
|
|
||||||
|
|
||||||
def download_html(url: str) -> Tuple[Optional[str], list]:
    """Single entry point for HTML download; tries each downloader in priority order.

    The first downloader that returns content wins. Every downloader that
    fails contributes one line to ``failures``.

    Args:
        url: Target URL.

    Returns:
        ``(content, failures)``:
        - content: the HTML text on success, ``None`` when every downloader failed.
        - failures: list of ``"- <name>: <reason>"`` strings for the
          downloaders that were tried and failed before the result was returned.
    """
    failures = []

    # Downloaders in priority order.
    downloaders = [
        ("pyppeteer", download_with_pyppeteer),
        ("selenium", download_with_selenium),
        ("httpx", download_with_httpx),
        ("urllib", download_with_urllib),
    ]

    for name, func in downloaders:
        try:
            content, error = func(url)
        except Exception as e:
            # A downloader is expected to report errors through its return
            # value; if one raises anyway, record it and keep falling back
            # instead of aborting the whole chain.
            failures.append(f"- {name}: [意外异常] {type(e).__name__}: {str(e)}")
            continue

        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
|
||||||
39
scripts/readers/html/downloader/__init__.py
Normal file
39
scripts/readers/html/downloader/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""HTML 下载器子包,支持多种下载方式按优先级降级"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple, List
|
||||||
|
|
||||||
|
from . import pyppeteer
|
||||||
|
from . import selenium
|
||||||
|
from . import httpx
|
||||||
|
from . import urllib
|
||||||
|
|
||||||
|
|
||||||
|
# Downloaders in priority order; download_html() tries them first to last.
DOWNLOADERS = [
    ("pyppeteer", pyppeteer.download),
    ("selenium", selenium.download),
    ("httpx", httpx.download),
    ("urllib", urllib.download),
]


def download_html(url: str) -> Tuple[Optional[str], List[str]]:
    """Single entry point for HTML download; tries each downloader in priority order.

    Args:
        url: Target URL.

    Returns:
        ``(content, failures)``: ``content`` is the HTML text on success and
        ``None`` when every downloader failed; ``failures`` lists one
        ``"- <name>: <reason>"`` entry per downloader that was tried and failed.
    """
    failures: List[str] = []

    for name, func in DOWNLOADERS:
        try:
            content, error = func(url)
        except Exception as e:
            # Downloaders are expected to report errors through their return
            # value; if one raises anyway, record it and keep falling back
            # instead of aborting the whole chain.
            failures.append(f"- {name}: [意外异常] {type(e).__name__}: {str(e)}")
            continue

        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
||||||
65
scripts/readers/html/downloader/common.py
Normal file
65
scripts/readers/html/downloader/common.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""下载器公共配置"""
|
||||||
|
|
||||||
|
# 公共配置
|
||||||
|
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"

# Chrome launch arguments (shared by the pyppeteer and selenium downloaders).
CHROME_ARGS = [
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--disable-background-networking",
    "--disable-default-apps",
    "--disable-sync",
    "--disable-translate",
    "--hide-scrollbars",
    "--metrics-recording-only",
    "--mute-audio",
    "--no-first-run",
    "--safebrowsing-disable-auto-update",
    "--blink-settings=imagesEnabled=false",
    "--disable-plugins",
    "--disable-ipc-flooding-protection",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-hang-monitor",
    "--disable-prompt-on-repost",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-domain-reliability",
    # All features go into ONE comma-separated switch: when the same switch is
    # repeated, Chromium may keep only a single occurrence, which would
    # silently drop the other entries.
    "--disable-features=site-per-process,IsolateOrigins,VizDisplayCompositor,WebRTC",
    f"--window-size={WINDOW_SIZE}",
    f"--lang={LANGUAGE_SETTING}",
    f"--user-agent={USER_AGENT}",
]
|
||||||
|
|
||||||
|
# 隐藏自动化特征的脚本(pyppeteer 和 selenium 共用)
|
||||||
|
HIDE_AUTOMATION_SCRIPT = """
|
||||||
|
() => {
|
||||||
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
|
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||||
|
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# pyppeteer 额外的隐藏自动化脚本(包含 notifications 处理)
|
||||||
|
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
|
||||||
|
() => {
|
||||||
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
|
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
|
||||||
|
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
|
||||||
|
const originalQuery = window.navigator.permissions.query;
|
||||||
|
window.navigator.permissions.query = (parameters) => (
|
||||||
|
parameters.name === 'notifications' ?
|
||||||
|
Promise.resolve({ state: Notification.permission }) :
|
||||||
|
originalQuery(parameters)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
"""
|
||||||
38
scripts/readers/html/downloader/httpx.py
Normal file
38
scripts/readers/html/downloader/httpx.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
"""使用 httpx 下载 URL(轻量级 HTTP 客户端)"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with httpx, a lightweight HTTP client (no JS rendering).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the response text and
        ``error`` is ``None``; on failure ``content`` is ``None`` and ``error``
        describes the reason (library missing, non-200 status, empty body, or
        an exception raised by httpx).
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    headers = {"User-Agent": USER_AGENT}

    try:
        # httpx does not follow redirects unless asked to; without this flag a
        # plain 301/302 answer would be reported as a download failure.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers=headers)
            if response.status_code != 200:
                return None, f"HTTP {response.status_code}"

            content = response.text
            if not content or not content.strip():
                return None, "下载内容为空"
            return content, None
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"
|
||||||
65
scripts/readers/html/downloader/pyppeteer.py
Normal file
65
scripts/readers/html/downloader/pyppeteer.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""使用 pyppeteer 下载 URL(支持 JS 渲染)"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import (
|
||||||
|
USER_AGENT,
|
||||||
|
CHROME_ARGS,
|
||||||
|
HIDE_AUTOMATION_SCRIPT_PUPPETEER
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with pyppeteer (headless Chromium, JS is rendered).

    The Chromium binary is taken from the ``LYXY_CHROMIUM_BINARY`` environment
    variable when it points to an existing file; otherwise ``executablePath``
    is left as ``None`` and pyppeteer picks the browser itself.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the page HTML and
        ``error`` is ``None``; on failure ``content`` is ``None`` and ``error``
        describes the reason.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # pyppeteer reads PYPPETEER_HOME when the package is first imported,
        # so it must be set BEFORE the import below to have any effect.
        os.environ["PYPPETEER_HOME"] = os.path.join(
            tempfile.gettempdir(), "pyppeteer_home"
        )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    executable_path = (
        chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
    )

    async def _download():
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()

            # Installed before navigation so it runs in every new document.
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)

            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            # Best-effort cleanup: a failure while closing the browser must not
            # hide the real result or the real error.
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    pass

    try:
        content = asyncio.run(_download())
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"pyppeteer 下载失败: {str(e)}"
|
||||||
92
scripts/readers/html/downloader/selenium.py
Normal file
92
scripts/readers/html/downloader/selenium.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""使用 selenium 下载 URL(支持 JS 渲染)"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import (
|
||||||
|
USER_AGENT,
|
||||||
|
CHROME_ARGS,
|
||||||
|
HIDE_AUTOMATION_SCRIPT
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with selenium driving headless Chrome (JS is rendered).

    Requires two environment variables pointing at existing files:
    ``LYXY_CHROMIUM_DRIVER`` (chromedriver) and ``LYXY_CHROMIUM_BINARY``
    (the browser binary).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the page source and
        ``error`` is ``None``; on failure ``content`` is ``None`` and ``error``
        describes the reason.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)

    # Hide automation fingerprints.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        import time

        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # HIDE_AUTOMATION_SCRIPT is an arrow-function expression. The CDP
        # command evaluates ``source`` as a plain script, so the function has
        # to be invoked explicitly or its body never runs.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": f"({HIDE_AUTOMATION_SCRIPT})();"},
        )

        driver.get(url)

        # Wait for the document to finish loading ...
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # ... then poll until the page source length stops changing
        # (two consecutive identical readings, at most 30 polls of 0.5 s).
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
                last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        # Best-effort cleanup: a failure while quitting must not mask the result.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
|
||||||
35
scripts/readers/html/downloader/urllib.py
Normal file
35
scripts/readers/html/downloader/urllib.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""使用 urllib 下载 URL(标准库,兜底方案)"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with the standard-library urllib (last-resort fallback).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the decoded body and
        ``error`` is ``None``; on failure ``content`` is ``None`` and ``error``
        describes the reason (non-200 status, empty body, or any exception
        raised while fetching or decoding).
    """
    headers = {"User-Agent": USER_AGENT}

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"
            raw = response.read()
            # Honour the charset announced in the Content-Type header instead
            # of assuming UTF-8; fall back to UTF-8 when none is declared.
            charset = response.headers.get_content_charset() or "utf-8"

        try:
            content = raw.decode(charset)
        except LookupError:
            # The server announced an encoding Python does not know.
            content = raw.decode("utf-8")

        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"
|
||||||
@@ -3,8 +3,8 @@
|
|||||||
import os
|
import os
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.readers.base import BaseReader
|
from readers.base import BaseReader
|
||||||
from scripts.utils import is_valid_pdf
|
from utils import is_valid_pdf
|
||||||
|
|
||||||
from . import docling_ocr
|
from . import docling_ocr
|
||||||
from . import unstructured_ocr
|
from . import unstructured_ocr
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_markitdown
|
from readers._utils import parse_via_markitdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 MarkItDown 库解析 PDF 文件"""
|
"""使用 MarkItDown 库解析 PDF 文件"""
|
||||||
return parse_with_markitdown(file_path)
|
return parse_via_markitdown(file_path)
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import _unstructured_elements_to_markdown
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
@@ -20,7 +20,7 @@ def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|||||||
languages=["chi_sim"],
|
languages=["chi_sim"],
|
||||||
)
|
)
|
||||||
# fast 策略不做版面分析,Title 类型标注不可靠
|
# fast 策略不做版面分析,Title 类型标注不可靠
|
||||||
content = _unstructured_elements_to_markdown(elements, trust_titles=False)
|
content = convert_unstructured_to_markdown(elements, trust_titles=False)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
return None, "文档为空"
|
return None, "文档为空"
|
||||||
return content, None
|
return content, None
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import _unstructured_elements_to_markdown
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
@@ -26,7 +26,7 @@ def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|||||||
ocr_agent=OCR_AGENT_PADDLE,
|
ocr_agent=OCR_AGENT_PADDLE,
|
||||||
table_ocr_agent=OCR_AGENT_PADDLE,
|
table_ocr_agent=OCR_AGENT_PADDLE,
|
||||||
)
|
)
|
||||||
content = _unstructured_elements_to_markdown(elements, trust_titles=True)
|
content = convert_unstructured_to_markdown(elements, trust_titles=True)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
return None, "文档为空"
|
return None, "文档为空"
|
||||||
return content, None
|
return content, None
|
||||||
|
|||||||
46
scripts/readers/ppt/__init__.py
Normal file
46
scripts/readers/ppt/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""PPT 文件阅读器,使用 LibreOffice 解析。"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from readers.base import BaseReader
|
||||||
|
from utils import is_valid_ppt
|
||||||
|
|
||||||
|
from . import libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
# Parsers tried in order by PptReader.parse().
PARSERS = [
    ("LibreOffice", libreoffice.parse),
]


class PptReader(BaseReader):
    """Reader for ``.ppt`` files; delegates to the parsers listed in PARSERS."""

    def supports(self, file_path: str) -> bool:
        """Return True when ``file_path`` ends with ``.ppt`` (case-insensitive)."""
        lowered = file_path.lower()
        return lowered.endswith('.ppt')

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse ``file_path`` with the first parser that yields content.

        Returns:
            ``(content, failures)``: ``content`` is the parser output, or
            ``None`` when the file is missing, fails validation, or every
            parser failed; ``failures`` holds one message per failed step.
        """
        # Guard clauses: the file must exist and pass the PPT validity check.
        if not os.path.exists(file_path):
            return None, ["文件不存在"]
        if not is_valid_ppt(file_path):
            return None, ["不是有效的 PPT 文件"]

        failures = []
        for parser_name, parser_func in PARSERS:
            try:
                content, error = parser_func(file_path)
            except Exception as e:
                # Parsers should report errors via their return value; record
                # anything that escapes and move on to the next parser.
                failures.append(
                    f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}"
                )
                continue

            if content is not None:
                return content, failures
            failures.append(f"- {parser_name}: {error}")

        return None, failures
|
||||||
37
scripts/readers/ppt/libreoffice.py
Normal file
37
scripts/readers/ppt/libreoffice.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""使用 LibreOffice soffice 命令行转换 PPT 为 PPTX 后复用 PptxReader 解析"""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from readers._utils import convert_via_libreoffice
|
||||||
|
from readers.pptx import PptxReader
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PPT file by converting it to PPTX and reusing PptxReader.

    The conversion output is written to a temporary directory that is removed
    when the function returns.

    Args:
        file_path: Path of the PPT file.

    Returns:
        ``(markdown_content, error_message)``: ``(content, None)`` on success,
        ``(None, error)`` on failure.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        # Step 1: PPT -> PPTX.
        converted_path, convert_error = convert_via_libreoffice(
            input_path=file_path,
            target_format="pptx",
            output_dir=Path(work_dir),
            timeout=60
        )
        if convert_error:
            return None, convert_error

        # Step 2: parse the converted file while the temp directory still exists.
        markdown, parse_failures = PptxReader().parse(str(converted_path))
        if markdown is None:
            return None, f"转换成功但 PPTX 解析失败: {parse_failures}"
        return markdown, None
|
||||||
@@ -3,8 +3,8 @@
|
|||||||
import os
|
import os
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.readers.base import BaseReader
|
from readers.base import BaseReader
|
||||||
from scripts.utils import is_valid_pptx
|
from utils import is_valid_pptx
|
||||||
|
|
||||||
from . import docling
|
from . import docling
|
||||||
from . import unstructured
|
from . import unstructured
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_docling
|
from readers._utils import parse_via_docling
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 docling 库解析 PPTX 文件"""
|
"""使用 docling 库解析 PPTX 文件"""
|
||||||
return parse_with_docling(file_path)
|
return parse_via_docling(file_path)
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_markitdown
|
from readers._utils import parse_via_markitdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 MarkItDown 库解析 PPTX 文件"""
|
"""使用 MarkItDown 库解析 PPTX 文件"""
|
||||||
return parse_with_markitdown(file_path)
|
return parse_via_markitdown(file_path)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
|
|||||||
import zipfile
|
import zipfile
|
||||||
from typing import Any, List, Optional, Tuple
|
from typing import Any, List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import build_markdown_table, flush_list_stack
|
from readers._utils import build_markdown_table, flush_list_stack
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Any, List, Optional, Tuple
|
from typing import Any, List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import build_markdown_table, flush_list_stack
|
from readers._utils import build_markdown_table, flush_list_stack
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import _unstructured_elements_to_markdown
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
@@ -16,7 +16,7 @@ def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|||||||
elements = partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=file_path, infer_table_structure=True, include_metadata=True
|
filename=file_path, infer_table_structure=True, include_metadata=True
|
||||||
)
|
)
|
||||||
content = _unstructured_elements_to_markdown(elements)
|
content = convert_unstructured_to_markdown(elements)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
return None, "文档为空"
|
return None, "文档为空"
|
||||||
return content, None
|
return content, None
|
||||||
|
|||||||
50
scripts/readers/xls/__init__.py
Normal file
50
scripts/readers/xls/__init__.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
"""XLS 文件阅读器,支持多种解析方法。"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
from readers.base import BaseReader
|
||||||
|
from utils import is_valid_xls
|
||||||
|
|
||||||
|
from . import unstructured
|
||||||
|
from . import markitdown
|
||||||
|
from . import pandas
|
||||||
|
|
||||||
|
|
||||||
|
# Parsers tried in order by XlsReader.parse().
PARSERS = [
    ("unstructured", unstructured.parse),
    ("MarkItDown", markitdown.parse),
    ("pandas+xlrd", pandas.parse),
]


class XlsReader(BaseReader):
    """Reader for ``.xls`` files; delegates to the parsers listed in PARSERS."""

    def supports(self, file_path: str) -> bool:
        """Return True when ``file_path`` ends with ``.xls`` (case-insensitive)."""
        lowered = file_path.lower()
        return lowered.endswith('.xls')

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse ``file_path`` with the first parser that yields content.

        Returns:
            ``(content, failures)``: ``content`` is the parser output, or
            ``None`` when the file is missing, fails validation, or every
            parser failed; ``failures`` holds one message per failed step.
        """
        # Guard clauses: the file must exist and pass the XLS validity check.
        if not os.path.exists(file_path):
            return None, ["文件不存在"]
        if not is_valid_xls(file_path):
            return None, ["不是有效的 XLS 文件"]

        failures = []
        for parser_name, parser_func in PARSERS:
            try:
                content, error = parser_func(file_path)
            except Exception as e:
                # Parsers should report errors via their return value; record
                # anything that escapes and move on to the next parser.
                failures.append(
                    f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}"
                )
                continue

            if content is not None:
                return content, failures
            failures.append(f"- {parser_name}: {error}")

        return None, failures
|
||||||
10
scripts/readers/xls/markitdown.py
Normal file
10
scripts/readers/xls/markitdown.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
"""使用 MarkItDown 库解析 XLS 文件"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from readers._utils import parse_via_markitdown
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLS file by delegating to the shared MarkItDown helper.

    Returns whatever ``parse_via_markitdown`` returns for ``file_path``.
    """
    result = parse_via_markitdown(file_path)
    return result
|
||||||
41
scripts/readers/xls/pandas.py
Normal file
41
scripts/readers/xls/pandas.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""使用 pandas+xlrd 库解析 XLS 文件"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLS file with pandas (xlrd engine) and render it as Markdown.

    Every worksheet becomes a ``## <sheet name>`` section followed by a
    pipe-style table; empty worksheets get a placeholder line instead.

    Args:
        file_path: Path of the XLS file.

    Returns:
        ``(content, error)``: ``(markdown, None)`` on success, ``(None, message)``
        when a required library is missing, the workbook has no sheets, or
        reading/rendering fails.
    """
    try:
        import pandas as pd
        # Imported only to detect its absence up front: read_excel(engine="xlrd")
        # needs it, and without this import the failure would surface later as
        # a generic parse error instead of a "not installed" message.
        import xlrd  # noqa: F401
        from tabulate import tabulate
    except ImportError as e:
        # Prefer the module name carried by the exception; fall back to
        # searching the message text when the name is unavailable.
        missing_lib = (e.name or "").partition(".")[0]
        if not missing_lib:
            missing_lib = next(
                (lib for lib in ("pandas", "xlrd") if lib in str(e)),
                "tabulate",
            )
        return None, f"{missing_lib} 库未安装"

    try:
        # sheet_name=None loads every worksheet into a {name: DataFrame} dict.
        sheets = pd.read_excel(file_path, sheet_name=None, engine="xlrd")

        markdown_parts = []
        for sheet_name, df in sheets.items():
            if len(df) == 0:
                markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
                continue

            table_md = tabulate(
                df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
            )
            markdown_parts.append(f"## {sheet_name}\n\n{table_md}")

        if not markdown_parts:
            return None, "Excel 文件为空"

        markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
        return markdown_content, None
    except Exception as e:
        return None, f"pandas 解析失败: {str(e)}"
|
||||||
22
scripts/readers/xls/unstructured.py
Normal file
22
scripts/readers/xls/unstructured.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
"""使用 unstructured 库解析 XLS 文件"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLS file with unstructured and convert the elements to Markdown.

    Returns:
        ``(content, error)``: ``(markdown, None)`` on success; ``(None, message)``
        when unstructured is not installed, the result is blank, or parsing raises.
    """
    try:
        from unstructured.partition.xlsx import partition_xlsx
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        elements = partition_xlsx(filename=file_path, infer_table_structure=True)
        markdown = convert_unstructured_to_markdown(elements)
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as e:
        return None, f"unstructured 解析失败: {str(e)}"
|
||||||
@@ -3,8 +3,8 @@
|
|||||||
import os
|
import os
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.readers.base import BaseReader
|
from readers.base import BaseReader
|
||||||
from scripts.utils import is_valid_xlsx
|
from utils import is_valid_xlsx
|
||||||
|
|
||||||
from . import docling
|
from . import docling
|
||||||
from . import unstructured
|
from . import unstructured
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_docling
|
from readers._utils import parse_via_docling
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 docling 库解析 XLSX 文件"""
|
"""使用 docling 库解析 XLSX 文件"""
|
||||||
return parse_with_docling(file_path)
|
return parse_via_docling(file_path)
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import parse_with_markitdown
|
from readers._utils import parse_via_markitdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""使用 MarkItDown 库解析 XLSX 文件"""
|
"""使用 MarkItDown 库解析 XLSX 文件"""
|
||||||
return parse_with_markitdown(file_path)
|
return parse_via_markitdown(file_path)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
|||||||
import zipfile
|
import zipfile
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import build_markdown_table, safe_open_zip
|
from readers._utils import build_markdown_table, safe_open_zip
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.core import _unstructured_elements_to_markdown
|
from readers._utils import convert_unstructured_to_markdown
|
||||||
|
|
||||||
|
|
||||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
@@ -14,7 +14,7 @@ def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
elements = partition_xlsx(filename=file_path, infer_table_structure=True)
|
elements = partition_xlsx(filename=file_path, infer_table_structure=True)
|
||||||
content = _unstructured_elements_to_markdown(elements)
|
content = convert_unstructured_to_markdown(elements)
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
return None, "文档为空"
|
return None, "文档为空"
|
||||||
return content, None
|
return content, None
|
||||||
|
|||||||
@@ -1,21 +1,25 @@
|
|||||||
"""Utils module for lyxy-document."""
|
"""Utils module for lyxy-document."""
|
||||||
|
|
||||||
from .file_detection import (
|
from .file_detection import (
|
||||||
|
is_valid_doc,
|
||||||
is_valid_docx,
|
is_valid_docx,
|
||||||
is_valid_pptx,
|
is_valid_pptx,
|
||||||
is_valid_xlsx,
|
is_valid_xlsx,
|
||||||
is_valid_pdf,
|
is_valid_pdf,
|
||||||
|
is_valid_xls,
|
||||||
|
is_valid_ppt,
|
||||||
is_html_file,
|
is_html_file,
|
||||||
is_url,
|
is_url,
|
||||||
detect_file_type,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"is_valid_doc",
|
||||||
"is_valid_docx",
|
"is_valid_docx",
|
||||||
"is_valid_pptx",
|
"is_valid_pptx",
|
||||||
"is_valid_xlsx",
|
"is_valid_xlsx",
|
||||||
"is_valid_pdf",
|
"is_valid_pdf",
|
||||||
|
"is_valid_xls",
|
||||||
|
"is_valid_ppt",
|
||||||
"is_html_file",
|
"is_html_file",
|
||||||
"is_url",
|
"is_url",
|
||||||
"detect_file_type",
|
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from scripts.config import Config
|
from config import Config
|
||||||
|
|
||||||
|
|
||||||
def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
|||||||
@@ -5,6 +5,19 @@ import zipfile
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def _is_valid_ole(file_path: str) -> bool:
|
||||||
|
"""验证 OLE2 格式文件(XLS/DOC)"""
|
||||||
|
try:
|
||||||
|
import olefile
|
||||||
|
except ImportError:
|
||||||
|
# 如果 olefile 未安装,就不做严格验证
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
return olefile.isOleFile(file_path)
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _is_valid_ooxml(file_path: str, required_files: List[str]) -> bool:
|
def _is_valid_ooxml(file_path: str, required_files: List[str]) -> bool:
|
||||||
"""验证 OOXML 格式文件(DOCX/PPTX/XLSX)"""
|
"""验证 OOXML 格式文件(DOCX/PPTX/XLSX)"""
|
||||||
try:
|
try:
|
||||||
@@ -35,6 +48,21 @@ def is_valid_xlsx(file_path: str) -> bool:
|
|||||||
return _is_valid_ooxml(file_path, _XLSX_REQUIRED)
|
return _is_valid_ooxml(file_path, _XLSX_REQUIRED)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_xls(file_path: str) -> bool:
|
||||||
|
"""验证文件是否为有效的 XLS 格式"""
|
||||||
|
return _is_valid_ole(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_doc(file_path: str) -> bool:
|
||||||
|
"""验证文件是否为有效的 DOC 格式(OLE2)"""
|
||||||
|
return _is_valid_ole(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_ppt(file_path: str) -> bool:
|
||||||
|
"""验证文件是否为有效的 PPT 格式(OLE2)"""
|
||||||
|
return _is_valid_ole(file_path)
|
||||||
|
|
||||||
|
|
||||||
def is_valid_pdf(file_path: str) -> bool:
|
def is_valid_pdf(file_path: str) -> bool:
|
||||||
"""验证文件是否为有效的 PDF 格式"""
|
"""验证文件是否为有效的 PDF 格式"""
|
||||||
try:
|
try:
|
||||||
@@ -54,20 +82,3 @@ def is_html_file(file_path: str) -> bool:
|
|||||||
def is_url(input_str: str) -> bool:
|
def is_url(input_str: str) -> bool:
|
||||||
"""判断输入是否为 URL"""
|
"""判断输入是否为 URL"""
|
||||||
return input_str.startswith("http://") or input_str.startswith("https://")
|
return input_str.startswith("http://") or input_str.startswith("https://")
|
||||||
|
|
||||||
|
|
||||||
_FILE_TYPE_VALIDATORS = {
|
|
||||||
".docx": is_valid_docx,
|
|
||||||
".pptx": is_valid_pptx,
|
|
||||||
".xlsx": is_valid_xlsx,
|
|
||||||
".pdf": is_valid_pdf,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def detect_file_type(file_path: str) -> Optional[str]:
|
|
||||||
"""检测文件类型,返回 'docx'、'pptx'、'xlsx' 或 'pdf'"""
|
|
||||||
ext = os.path.splitext(file_path)[1].lower()
|
|
||||||
validator = _FILE_TYPE_VALIDATORS.get(ext)
|
|
||||||
if validator and validator(file_path):
|
|
||||||
return ext.lstrip(".")
|
|
||||||
return None
|
|
||||||
|
|||||||
156
skill/SKILL.md
156
skill/SKILL.md
@@ -1,156 +0,0 @@
|
|||||||
---
|
|
||||||
name: lyxy-document-reader
|
|
||||||
description: 统一文档解析工具 - 将 DOCX、XLSX、PPTX、PDF、HTML/URL 转换为 Markdown。支持全文输出、字数统计、行数统计、标题提取、章节提取、正则搜索。当用户要求"读取/解析/打开文档"、上传 .docx/.xlsx/.pptx/.pdf/.html 文件、或提供 URL 时使用。
|
|
||||||
license: MIT
|
|
||||||
metadata:
|
|
||||||
version: "1.0"
|
|
||||||
author: lyxy
|
|
||||||
compatibility: Requires Python 3.11+. 优先使用 lyxy-runner-python skill 执行(自动管理依赖)。回退到主机 Python 时需手动安装依赖:DOCX(docling unstructured markitdown pypandoc-binary python-docx markdownify chardet) / XLSX(docling unstructured markitdown pandas tabulate chardet) / PPTX(docling unstructured markitdown python-pptx markdownify chardet) / PDF(docling unstructured unstructured-paddleocr markitdown pypdf markdownify chardet) / HTML(trafilatura domscribe markitdown html2text beautifulsoup4 httpx chardet) / HTTP增强(pyppeteer selenium)
|
|
||||||
---
|
|
||||||
|
|
||||||
# 统一文档解析 Skill
|
|
||||||
|
|
||||||
将 DOCX、XLSX、PPTX、PDF、HTML 文件或 URL 网页内容解析为 Markdown 格式,支持多种查询模式。
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
**统一入口**:使用 `scripts/lyxy_document_reader.py` 作为统一的命令行入口,自动识别文件类型并执行解析。
|
|
||||||
|
|
||||||
**双路径执行**:此 skill 必须优先使用 **lyxy-runner-python skill** 执行脚本,该 skill 会自动管理 uv 隔离环境和依赖。当 lyxy-runner-python 不可用时,回退到主机 Python 环境执行。
|
|
||||||
|
|
||||||
**支持的文档类型**:
|
|
||||||
- **DOCX**:Word 文档
|
|
||||||
- **XLSX**:Excel 表格
|
|
||||||
- **PPTX**:PowerPoint 演示文稿
|
|
||||||
- **PDF**:PDF 文档(支持 OCR)
|
|
||||||
- **HTML/URL**:HTML 文件或网页地址
|
|
||||||
|
|
||||||
## When to Use
|
|
||||||
|
|
||||||
任何需要读取或解析 Office 文档、PDF、HTML 文件、URL 网页内容的任务都应使用此 skill。
|
|
||||||
|
|
||||||
### 典型场景
|
|
||||||
- **文档转换**:将各类文档转换为可读的 Markdown 文本
|
|
||||||
- **文档元数据**:获取文档的字数、行数等信息
|
|
||||||
- **标题分析**:提取文档的标题结构
|
|
||||||
- **章节提取**:提取特定章节的内容
|
|
||||||
- **内容搜索**:在文档中搜索关键词或正则模式
|
|
||||||
|
|
||||||
### 触发词
|
|
||||||
- **中文**:"读取/解析/打开 文档/Word/Excel/PPT/PDF/网页"
|
|
||||||
- **英文**:"read/parse/extract document/docx/xlsx/pptx/pdf/html"
|
|
||||||
- **文件扩展名**:`.docx`、`.xlsx`、`.pptx`、`.pdf`、`.html`、`.htm`
|
|
||||||
- **URL 模式**:`http://`、`https://`
|
|
||||||
|
|
||||||
## Quick Reference
|
|
||||||
|
|
||||||
| 参数 | 说明 |
|
|
||||||
|------|------|
|
|
||||||
| (无参数) | 输出完整 Markdown 内容 |
|
|
||||||
| `-c` / `--count` | 字数统计 |
|
|
||||||
| `-l` / `--lines` | 行数统计 |
|
|
||||||
| `-t` / `--titles` | 提取所有标题(1-6级) |
|
|
||||||
| `-tc <name>` | 提取指定标题的章节内容 |
|
|
||||||
| `-s <pattern>` | 正则表达式搜索 |
|
|
||||||
| `-n <num>` / `--context <num>` | 与 `-s` 配合,指定上下文行数(默认2) |
|
|
||||||
|
|
||||||
## Workflow
|
|
||||||
|
|
||||||
1. **检测执行环境**:
|
|
||||||
- 优先检测 **lyxy-runner-python skill** 是否可用
|
|
||||||
- 可用 → 使用 uv 隔离环境执行
|
|
||||||
- 不可用 → 回退到主机 Python 环境
|
|
||||||
|
|
||||||
2. **识别文件类型**:
|
|
||||||
- 根据文件扩展名自动选择对应的解析器
|
|
||||||
- URL 自动识别为 HTML/网页类型
|
|
||||||
|
|
||||||
3. **执行解析**:
|
|
||||||
- 按优先级尝试多个解析器,直到成功
|
|
||||||
- DOCX:docling → unstructured → pypandoc → MarkItDown → python-docx → XML
|
|
||||||
- XLSX:docling → unstructured → MarkItDown → pandas → XML
|
|
||||||
- PPTX:docling → unstructured → MarkItDown → python-pptx → XML
|
|
||||||
- PDF:docling OCR → unstructured OCR → docling → unstructured → MarkItDown → pypdf
|
|
||||||
- HTML:trafilatura → domscribe → MarkItDown → html2text
|
|
||||||
|
|
||||||
4. **输出结果**:
|
|
||||||
- 返回 Markdown 格式内容或统计信息
|
|
||||||
|
|
||||||
### 基本语法
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 方式 1:使用 lyxy-runner-python(推荐)
|
|
||||||
# lyxy-runner-python 会自动分析脚本依赖并使用 uv --with 安装
|
|
||||||
# AI 只需执行:
|
|
||||||
python scripts/lyxy_document_reader.py <文件路径或URL>
|
|
||||||
|
|
||||||
# 方式 2:回退到主机 Python(需要预先手动安装依赖)
|
|
||||||
# 根据文档类型安装对应依赖后执行:
|
|
||||||
python scripts/lyxy_document_reader.py <文件路径或URL>
|
|
||||||
```
|
|
||||||
|
|
||||||
### 使用示例
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 读取 Word 文档
|
|
||||||
python scripts/lyxy_document_reader.py document.docx
|
|
||||||
|
|
||||||
# 解析网页内容
|
|
||||||
python scripts/lyxy_document_reader.py https://example.com
|
|
||||||
|
|
||||||
# 统计字数
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -c
|
|
||||||
|
|
||||||
# 提取所有标题
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -t
|
|
||||||
|
|
||||||
# 提取指定章节
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -tc "第三章"
|
|
||||||
|
|
||||||
# 搜索内容
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -s "关键词"
|
|
||||||
|
|
||||||
# 正则搜索
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -s "\d{4}-\d{2}-\d{2}"
|
|
||||||
|
|
||||||
# 指定上下文行数
|
|
||||||
python scripts/lyxy_document_reader.py document.docx -s "关键词" -n 5
|
|
||||||
```
|
|
||||||
|
|
||||||
### 主机 Python 环境依赖安装
|
|
||||||
|
|
||||||
当 lyxy-runner-python 不可用时,需要根据文档类型手动安装依赖:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# DOCX 文档
|
|
||||||
pip install docling unstructured markitdown pypandoc-binary python-docx markdownify chardet
|
|
||||||
|
|
||||||
# XLSX 表格
|
|
||||||
pip install docling unstructured markitdown pandas tabulate chardet
|
|
||||||
|
|
||||||
# PPTX 演示文稿
|
|
||||||
pip install docling unstructured markitdown python-pptx markdownify chardet
|
|
||||||
|
|
||||||
# PDF 文档
|
|
||||||
pip install docling unstructured unstructured-paddleocr markitdown pypdf markdownify chardet
|
|
||||||
|
|
||||||
# HTML/URL 网页
|
|
||||||
pip install trafilatura domscribe markitdown html2text beautifulsoup4 httpx chardet
|
|
||||||
|
|
||||||
# 网页(需要 JS 渲染时,额外添加)
|
|
||||||
pip install pyppeteer selenium
|
|
||||||
|
|
||||||
# 安装所有文档类型支持
|
|
||||||
pip install docling unstructured unstructured-paddleocr markitdown pypandoc-binary python-docx python-pptx pandas tabulate pypdf markdownify trafilatura domscribe html2text beautifulsoup4 httpx pyppeteer selenium chardet
|
|
||||||
```
|
|
||||||
|
|
||||||
## 错误处理
|
|
||||||
|
|
||||||
| 错误信息 | 原因 | 解决 |
|
|
||||||
|---------|------|------|
|
|
||||||
| 错误: input_path 不能为空 | 未提供输入 | 提供 file_path 或 URL |
|
|
||||||
| 错误: 不支持的文件类型 | 无对应 reader | 检查文件扩展名 |
|
|
||||||
| 所有解析方法均失败 | 所有解析器失败 | 检查文件是否损坏 |
|
|
||||||
| 错误: 无效的正则表达式 | 正则语法错误 | 检查正则语法 |
|
|
||||||
| 错误: 未找到匹配 | 搜索无结果 | 检查搜索词或正则 |
|
|
||||||
| ModuleNotFoundError: No module named 'xxx' | 缺少依赖 | 使用 lyxy-runner-python 或 pip 安装对应依赖 |
|
|
||||||
@@ -1 +1,12 @@
|
|||||||
"""Tests package for lyxy-document."""
|
"""Tests package for lyxy-document."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# 将 scripts/ 目录添加到 sys.path
|
||||||
|
project_root = Path(__file__).resolve().parent.parent
|
||||||
|
scripts_dir = project_root / "scripts"
|
||||||
|
if str(scripts_dir) not in sys.path:
|
||||||
|
sys.path.insert(0, str(scripts_dir))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,16 @@
|
|||||||
"""测试配置和共享 fixtures。"""
|
"""测试配置和共享 fixtures。"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# 将 scripts/ 目录添加到 sys.path(必须在最顶部,在其他导入之前)
|
||||||
|
project_root = Path(__file__).resolve().parent.parent # tests/ 的父目录是项目根目录
|
||||||
|
scripts_dir = project_root / "scripts"
|
||||||
|
if str(scripts_dir) not in sys.path:
|
||||||
|
sys.path.insert(0, str(scripts_dir))
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from scripts.readers import READERS
|
from readers import READERS
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@@ -42,10 +51,7 @@ def temp_docx(tmp_path):
|
|||||||
str: 临时文件路径
|
str: 临时文件路径
|
||||||
"""
|
"""
|
||||||
def _create_docx(paragraphs=None, headings=None, table_data=None, list_items=None):
|
def _create_docx(paragraphs=None, headings=None, table_data=None, list_items=None):
|
||||||
try:
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
except ImportError:
|
|
||||||
pytest.skip("python-docx 未安装")
|
|
||||||
|
|
||||||
doc = Document()
|
doc = Document()
|
||||||
|
|
||||||
@@ -90,23 +96,38 @@ def temp_pdf(tmp_path):
|
|||||||
str: 临时文件路径
|
str: 临时文件路径
|
||||||
"""
|
"""
|
||||||
def _create_pdf(text=None, lines=None):
|
def _create_pdf(text=None, lines=None):
|
||||||
try:
|
|
||||||
from reportlab.pdfgen import canvas
|
from reportlab.pdfgen import canvas
|
||||||
from reportlab.lib.pagesizes import letter
|
from reportlab.lib.pagesizes import letter
|
||||||
from reportlab.pdfbase import pdfmetrics
|
from reportlab.pdfbase import pdfmetrics
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
except ImportError:
|
|
||||||
pytest.skip("reportlab 未安装")
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.pdf"
|
file_path = tmp_path / "test.pdf"
|
||||||
c = canvas.Canvas(str(file_path), pagesize=letter)
|
c = canvas.Canvas(str(file_path), pagesize=letter)
|
||||||
|
|
||||||
# 尝试注册中文字体(如果可用)
|
# 尝试注册中文字体(如果可用)
|
||||||
|
font_loaded = False
|
||||||
try:
|
try:
|
||||||
# 使用系统字体
|
# 尝试 macOS 中文字体
|
||||||
pdfmetrics.registerFont(TTFont('SimSun', 'simsun.ttc'))
|
for font_name, font_path, font_index in [
|
||||||
c.setFont('SimSun', 12)
|
('PingFangSC', '/System/Library/Fonts/PingFang.ttc', 0),
|
||||||
except:
|
('STHeiti', '/System/Library/Fonts/STHeiti Light.ttc', 0),
|
||||||
|
('STHeitiMedium', '/System/Library/Fonts/STHeiti Medium.ttc', 0),
|
||||||
|
]:
|
||||||
|
try:
|
||||||
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
|
import os
|
||||||
|
if os.path.exists(font_path):
|
||||||
|
# For TTC files, we need to specify the font index
|
||||||
|
pdfmetrics.registerFont(TTFont(font_name, font_path, subfontIndex=font_index))
|
||||||
|
c.setFont(font_name, 12)
|
||||||
|
font_loaded = True
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if not font_loaded:
|
||||||
# 回退到默认字体
|
# 回退到默认字体
|
||||||
c.setFont('Helvetica', 12)
|
c.setFont('Helvetica', 12)
|
||||||
|
|
||||||
@@ -167,10 +188,7 @@ def temp_pptx(tmp_path):
|
|||||||
str: 临时文件路径
|
str: 临时文件路径
|
||||||
"""
|
"""
|
||||||
def _create_pptx(slides=None):
|
def _create_pptx(slides=None):
|
||||||
try:
|
|
||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
except ImportError:
|
|
||||||
pytest.skip("python-pptx 未安装")
|
|
||||||
|
|
||||||
prs = Presentation()
|
prs = Presentation()
|
||||||
|
|
||||||
@@ -200,10 +218,7 @@ def temp_xlsx(tmp_path):
|
|||||||
str: 临时文件路径
|
str: 临时文件路径
|
||||||
"""
|
"""
|
||||||
def _create_xlsx(data=None):
|
def _create_xlsx(data=None):
|
||||||
try:
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
except ImportError:
|
|
||||||
pytest.skip("pandas 未安装")
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.xlsx"
|
file_path = tmp_path / "test.xlsx"
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from contextlib import redirect_stdout, redirect_stderr
|
from contextlib import redirect_stdout, redirect_stderr
|
||||||
|
|
||||||
@@ -22,7 +23,15 @@ def cli_runner():
|
|||||||
Returns:
|
Returns:
|
||||||
tuple: (stdout, stderr, exit_code)
|
tuple: (stdout, stderr, exit_code)
|
||||||
"""
|
"""
|
||||||
from scripts.lyxy_document_reader import main
|
# 将 scripts/ 目录添加到 sys.path
|
||||||
|
project_root = Path(__file__).resolve().parent.parent.parent # tests/test_cli/ 的父目录是 tests/,再父目录是项目根目录
|
||||||
|
scripts_dir = project_root / "scripts"
|
||||||
|
if str(scripts_dir) not in sys.path:
|
||||||
|
sys.path.insert(0, str(scripts_dir))
|
||||||
|
|
||||||
|
# 直接调用 bootstrap.main() 而不是 lyxy_document_reader.main()
|
||||||
|
# 因为 lyxy_document_reader 会调用 subprocess,无法捕获输出
|
||||||
|
from bootstrap import main
|
||||||
|
|
||||||
# 保存原始 sys.argv 和 sys.exit
|
# 保存原始 sys.argv 和 sys.exit
|
||||||
original_argv = sys.argv
|
original_argv = sys.argv
|
||||||
@@ -39,7 +48,7 @@ def cli_runner():
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# 设置命令行参数
|
# 设置命令行参数
|
||||||
sys.argv = ['lyxy_document_reader'] + args
|
sys.argv = ['bootstrap'] + args
|
||||||
sys.exit = mock_exit
|
sys.exit = mock_exit
|
||||||
|
|
||||||
# 捕获输出
|
# 捕获输出
|
||||||
|
|||||||
53
tests/test_cli/test_path_resolution.py
Normal file
53
tests/test_cli/test_path_resolution.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""测试路径解析功能 - 验证从任意路径调用脚本。"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class TestPathResolution:
|
||||||
|
"""测试路径解析逻辑。"""
|
||||||
|
|
||||||
|
def test_project_root_detection(self):
|
||||||
|
"""测试项目根目录检测逻辑。"""
|
||||||
|
# 模拟 lyxy_document_reader.py 中的路径计算逻辑
|
||||||
|
# 获取当前测试文件的路径,然后向上找到项目根
|
||||||
|
test_file = Path(__file__).resolve()
|
||||||
|
tests_dir = test_file.parent.parent # tests/
|
||||||
|
project_root = tests_dir.parent # 项目根
|
||||||
|
|
||||||
|
# 验证我们能正确找到项目根
|
||||||
|
assert (project_root / "scripts").exists()
|
||||||
|
assert (project_root / "scripts" / "lyxy_document_reader.py").exists()
|
||||||
|
assert (project_root / "scripts" / "bootstrap.py").exists()
|
||||||
|
|
||||||
|
def test_bootstrap_path_absolute(self):
|
||||||
|
"""测试 bootstrap.py 路径是绝对路径。"""
|
||||||
|
# 模拟 lyxy_document_reader.py 中的路径计算
|
||||||
|
test_file = Path(__file__).resolve()
|
||||||
|
project_root = test_file.parent.parent.parent # 从 tests/test_cli/ 向上两级
|
||||||
|
scripts_dir = project_root / "scripts"
|
||||||
|
bootstrap_path = scripts_dir / "bootstrap.py"
|
||||||
|
|
||||||
|
# 验证路径是绝对路径
|
||||||
|
assert bootstrap_path.is_absolute()
|
||||||
|
assert bootstrap_path.exists()
|
||||||
|
|
||||||
|
def test_path_independent_from_cwd(self, monkeypatch, tmp_path):
|
||||||
|
"""测试路径计算不依赖当前工作目录。"""
|
||||||
|
# 保存原始路径
|
||||||
|
test_file = Path(__file__).resolve()
|
||||||
|
project_root = test_file.parent.parent.parent
|
||||||
|
scripts_dir = project_root / "scripts"
|
||||||
|
|
||||||
|
# 切换到临时目录
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
# 即使在临时目录,我们仍然能通过 __file__ 找到正确的路径
|
||||||
|
# 这里我们模拟 lyxy_document_reader.py 中的逻辑
|
||||||
|
# 注意:实际中 __file__ 是脚本本身的路径,不是测试文件的路径
|
||||||
|
# 这里我们验证原理:__file__ 给出的是脚本位置,与 cwd 无关
|
||||||
|
|
||||||
|
# 验证 scripts_dir 和 bootstrap_path 的计算只依赖 __file__
|
||||||
|
# 这个测试验证的是概念,不是实际的脚本导入
|
||||||
|
assert scripts_dir.is_absolute()
|
||||||
|
assert (scripts_dir / "bootstrap.py").exists()
|
||||||
198
tests/test_core/test_advice_generator.py
Normal file
198
tests/test_core/test_advice_generator.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
"""测试 advice_generator 模块。"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from core.advice_generator import (
|
||||||
|
detect_file_type_light,
|
||||||
|
get_platform,
|
||||||
|
get_dependencies,
|
||||||
|
generate_uv_command,
|
||||||
|
generate_python_command,
|
||||||
|
format_advice,
|
||||||
|
generate_advice,
|
||||||
|
)
|
||||||
|
from readers import READERS, PdfReader, DocxReader, HtmlReader
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def readers():
|
||||||
|
"""提供已实例化的 readers 列表。"""
|
||||||
|
return [ReaderCls() for ReaderCls in READERS]
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetectFileTypeLight:
|
||||||
|
"""测试轻量文件类型检测函数。"""
|
||||||
|
|
||||||
|
def test_detect_pdf(self, readers):
|
||||||
|
"""测试检测 PDF 文件。"""
|
||||||
|
reader_cls = detect_file_type_light("test.pdf", readers)
|
||||||
|
assert reader_cls == PdfReader
|
||||||
|
|
||||||
|
def test_detect_docx(self, readers):
|
||||||
|
"""测试检测 DOCX 文件。"""
|
||||||
|
reader_cls = detect_file_type_light("test.docx", readers)
|
||||||
|
assert reader_cls == DocxReader
|
||||||
|
|
||||||
|
def test_detect_html(self, readers):
|
||||||
|
"""测试检测 HTML 文件。"""
|
||||||
|
reader_cls = detect_file_type_light("test.html", readers)
|
||||||
|
assert reader_cls == HtmlReader
|
||||||
|
|
||||||
|
def test_detect_url(self, readers):
|
||||||
|
"""测试检测 URL。"""
|
||||||
|
reader_cls = detect_file_type_light("https://example.com", readers)
|
||||||
|
assert reader_cls == HtmlReader
|
||||||
|
|
||||||
|
def test_detect_unknown(self, readers):
|
||||||
|
"""测试检测未知文件类型。"""
|
||||||
|
reader_cls = detect_file_type_light("test.xyz", readers)
|
||||||
|
assert reader_cls is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetPlatform:
|
||||||
|
"""测试平台检测函数。"""
|
||||||
|
|
||||||
|
def test_get_platform_format(self):
|
||||||
|
"""测试平台标识格式正确。"""
|
||||||
|
platform_id = get_platform()
|
||||||
|
# 格式应该是 {system}-{machine}
|
||||||
|
assert "-" in platform_id
|
||||||
|
# 至少包含两个部分
|
||||||
|
assert len(platform_id.split("-")) >= 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetDependencies:
|
||||||
|
"""测试依赖获取函数。"""
|
||||||
|
|
||||||
|
def test_get_default_dependencies(self):
|
||||||
|
"""测试获取默认依赖配置。"""
|
||||||
|
python_ver, deps = get_dependencies(DocxReader, "Unknown-Platform")
|
||||||
|
assert python_ver is None
|
||||||
|
assert len(deps) > 0
|
||||||
|
# 检查是否有 docling 相关依赖(可能带版本号)
|
||||||
|
assert any(dep.startswith("docling") for dep in deps)
|
||||||
|
|
||||||
|
def test_get_pdf_dependencies(self):
|
||||||
|
"""测试获取 PDF 依赖。"""
|
||||||
|
python_ver, deps = get_dependencies(PdfReader, "Darwin-arm64")
|
||||||
|
assert python_ver is None
|
||||||
|
# 检查是否有 docling 相关依赖(可能带版本号)
|
||||||
|
assert any(dep.startswith("docling") for dep in deps)
|
||||||
|
|
||||||
|
def test_get_html_dependencies(self):
|
||||||
|
"""测试获取 HTML 依赖。"""
|
||||||
|
python_ver, deps = get_dependencies(HtmlReader, "Linux-x86_64")
|
||||||
|
assert python_ver is None
|
||||||
|
# 检查是否有 trafilatura 相关依赖(可能带版本号)
|
||||||
|
assert any(dep.startswith("trafilatura") for dep in deps)
|
||||||
|
|
||||||
|
|
||||||
|
class TestGenerateUvCommand:
|
||||||
|
"""测试 uv 命令生成函数。"""
|
||||||
|
|
||||||
|
def test_generate_simple_command(self):
|
||||||
|
"""测试生成简单的 uv 命令。"""
|
||||||
|
cmd = generate_uv_command(
|
||||||
|
["pkg1", "pkg2"],
|
||||||
|
"input.pdf",
|
||||||
|
script_path="scripts/lyxy_document_reader.py"
|
||||||
|
)
|
||||||
|
assert "uv run" in cmd
|
||||||
|
assert "--with pkg1" in cmd
|
||||||
|
assert "--with pkg2" in cmd
|
||||||
|
assert "input.pdf" in cmd
|
||||||
|
|
||||||
|
def test_generate_with_python_version(self):
|
||||||
|
"""测试生成带 python 版本的 uv 命令。"""
|
||||||
|
cmd = generate_uv_command(
|
||||||
|
["pkg1"],
|
||||||
|
"input.pdf",
|
||||||
|
python_version="3.12",
|
||||||
|
script_path="scripts/lyxy_document_reader.py"
|
||||||
|
)
|
||||||
|
assert "--python 3.12" in cmd
|
||||||
|
|
||||||
|
def test_generate_with_quoted_deps(self):
|
||||||
|
"""测试生成带引号的依赖(如 unstructured[pdf])。"""
|
||||||
|
cmd = generate_uv_command(
|
||||||
|
["unstructured[pdf]", "pkg2"],
|
||||||
|
"input.pdf",
|
||||||
|
script_path="scripts/lyxy_document_reader.py"
|
||||||
|
)
|
||||||
|
assert '--with "unstructured[pdf]"' in cmd
|
||||||
|
|
||||||
|
|
||||||
|
class TestGeneratePythonCommand:
|
||||||
|
"""测试 python 命令生成函数。"""
|
||||||
|
|
||||||
|
def test_generate_python_command(self):
|
||||||
|
"""测试生成 python 命令。"""
|
||||||
|
python_cmd, pip_cmd = generate_python_command(
|
||||||
|
["pkg1", "pkg2"],
|
||||||
|
"input.pdf",
|
||||||
|
script_path="scripts/lyxy_document_reader.py"
|
||||||
|
)
|
||||||
|
assert python_cmd == "python scripts/lyxy_document_reader.py input.pdf"
|
||||||
|
assert pip_cmd == "pip install pyarmor pkg1 pkg2"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatAdvice:
|
||||||
|
"""测试建议格式化输出函数。"""
|
||||||
|
|
||||||
|
def test_format_without_platform(self):
|
||||||
|
"""测试无平台特殊配置的格式化输出。"""
|
||||||
|
output = format_advice(
|
||||||
|
"pdf",
|
||||||
|
"test.pdf",
|
||||||
|
"Darwin-arm64",
|
||||||
|
"uv run --with docling ...",
|
||||||
|
"python scripts/lyxy_document_reader.py test.pdf",
|
||||||
|
"pip install docling ...",
|
||||||
|
has_platform_specific=False
|
||||||
|
)
|
||||||
|
assert "文件类型: PDF" in output
|
||||||
|
assert "输入路径: test.pdf" in output
|
||||||
|
assert "平台:" not in output
|
||||||
|
assert "[uv 命令]" in output
|
||||||
|
assert "[python 命令]" in output
|
||||||
|
|
||||||
|
def test_format_with_platform(self):
|
||||||
|
"""测试有平台特殊配置的格式化输出。"""
|
||||||
|
output = format_advice(
|
||||||
|
"pdf",
|
||||||
|
"test.pdf",
|
||||||
|
"Darwin-x86_64",
|
||||||
|
"uv run --python 3.12 ...",
|
||||||
|
"python ...",
|
||||||
|
"pip install ...",
|
||||||
|
has_platform_specific=True
|
||||||
|
)
|
||||||
|
assert "平台: Darwin-x86_64" in output
|
||||||
|
|
||||||
|
|
||||||
|
class TestGenerateAdvice:
|
||||||
|
"""测试完整建议生成函数。"""
|
||||||
|
|
||||||
|
def test_generate_advice_pdf(self, readers):
|
||||||
|
"""测试生成 PDF 的建议。"""
|
||||||
|
advice = generate_advice("test.pdf", readers, "scripts/lyxy_document_reader.py")
|
||||||
|
assert advice is not None
|
||||||
|
assert "文件类型: PDF" in advice
|
||||||
|
assert "[uv 命令]" in advice
|
||||||
|
assert "[python 命令]" in advice
|
||||||
|
|
||||||
|
def test_generate_advice_url(self, readers):
|
||||||
|
"""测试生成 URL 的建议。"""
|
||||||
|
advice = generate_advice(
|
||||||
|
"https://example.com",
|
||||||
|
readers,
|
||||||
|
"scripts/lyxy_document_reader.py"
|
||||||
|
)
|
||||||
|
assert advice is not None
|
||||||
|
assert "文件类型: HTML" in advice
|
||||||
|
|
||||||
|
def test_generate_advice_unknown(self, readers):
|
||||||
|
"""测试生成未知类型的建议。"""
|
||||||
|
advice = generate_advice("test.xyz", readers, "scripts/lyxy_document_reader.py")
|
||||||
|
assert advice is None
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
"""测试 Markdown 工具函数。"""
|
"""测试 Markdown 工具函数。"""
|
||||||
|
|
||||||
from scripts.core import (
|
from core import (
|
||||||
get_heading_level,
|
get_heading_level,
|
||||||
extract_titles,
|
extract_titles,
|
||||||
normalize_markdown_whitespace,
|
normalize_markdown_whitespace,
|
||||||
|
|||||||
233
tests/test_core/test_markdown_extra.py
Normal file
233
tests/test_core/test_markdown_extra.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
"""测试 markdown 模块的高级功能(extract_title_content, search_markdown)。"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from core.markdown import extract_title_content, search_markdown
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractTitleContent:
|
||||||
|
"""测试 extract_title_content 函数。"""
|
||||||
|
|
||||||
|
def test_extract_simple_title(self):
|
||||||
|
"""测试提取简单标题。"""
|
||||||
|
markdown = """# 目标标题
|
||||||
|
|
||||||
|
这是标题下的内容。
|
||||||
|
第二段内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "# 目标标题" in result
|
||||||
|
assert "这是标题下的内容" in result
|
||||||
|
|
||||||
|
def test_extract_with_subtitles(self):
|
||||||
|
"""测试提取包含子标题的内容。"""
|
||||||
|
markdown = """# 目标标题
|
||||||
|
|
||||||
|
这是标题下的内容。
|
||||||
|
|
||||||
|
## 子标题
|
||||||
|
|
||||||
|
子标题下的内容。
|
||||||
|
|
||||||
|
### 孙子标题
|
||||||
|
|
||||||
|
更深层的内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "# 目标标题" in result
|
||||||
|
assert "## 子标题" in result
|
||||||
|
assert "### 孙子标题" in result
|
||||||
|
|
||||||
|
def test_extract_stop_at_sibling_title(self):
|
||||||
|
"""测试在同级标题处停止。"""
|
||||||
|
markdown = """# 目标标题
|
||||||
|
|
||||||
|
目标内容。
|
||||||
|
|
||||||
|
# 另一个标题
|
||||||
|
|
||||||
|
另一个内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "# 目标标题" in result
|
||||||
|
assert "目标内容" in result
|
||||||
|
assert "# 另一个标题" not in result
|
||||||
|
|
||||||
|
def test_extract_with_parent_titles(self):
|
||||||
|
"""测试包含父级标题。"""
|
||||||
|
markdown = """# 父级标题
|
||||||
|
|
||||||
|
父级内容。
|
||||||
|
|
||||||
|
## 目标标题
|
||||||
|
|
||||||
|
目标内容。
|
||||||
|
|
||||||
|
### 子标题
|
||||||
|
|
||||||
|
子内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "# 父级标题" in result
|
||||||
|
assert "## 目标标题" in result
|
||||||
|
assert "### 子标题" in result
|
||||||
|
|
||||||
|
def test_extract_multiple_matches(self):
|
||||||
|
"""测试多个匹配标题的情况。"""
|
||||||
|
markdown = """# 第一章
|
||||||
|
|
||||||
|
## 目标标题
|
||||||
|
|
||||||
|
第一章的目标内容。
|
||||||
|
|
||||||
|
# 第二章
|
||||||
|
|
||||||
|
## 目标标题
|
||||||
|
|
||||||
|
第二章的目标内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "第一章的目标内容" in result
|
||||||
|
assert "第二章的目标内容" in result
|
||||||
|
assert "---" in result
|
||||||
|
|
||||||
|
def test_title_not_found(self):
|
||||||
|
"""测试标题不存在的情况。"""
|
||||||
|
markdown = "# 其他标题\n内容"
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "不存在的标题")
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_deep_nested_title(self):
|
||||||
|
"""测试深层嵌套标题。"""
|
||||||
|
markdown = """# H1
|
||||||
|
|
||||||
|
## H2
|
||||||
|
|
||||||
|
### H3
|
||||||
|
|
||||||
|
#### 目标标题
|
||||||
|
|
||||||
|
目标内容。"""
|
||||||
|
|
||||||
|
result = extract_title_content(markdown, "目标标题")
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "# H1" in result
|
||||||
|
assert "## H2" in result
|
||||||
|
assert "### H3" in result
|
||||||
|
assert "#### 目标标题" in result
|
||||||
|
|
||||||
|
|
||||||
|
class TestSearchMarkdown:
|
||||||
|
"""测试 search_markdown 函数。"""
|
||||||
|
|
||||||
|
def test_search_simple_pattern(self):
|
||||||
|
"""测试简单搜索模式。"""
|
||||||
|
content = """第一行
|
||||||
|
第二行
|
||||||
|
包含关键词的行
|
||||||
|
第四行"""
|
||||||
|
|
||||||
|
result = search_markdown(content, "关键词", context_lines=0)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "关键词" in result
|
||||||
|
|
||||||
|
def test_search_with_context(self):
|
||||||
|
"""测试带上下文的搜索。"""
|
||||||
|
content = """行1
|
||||||
|
行2
|
||||||
|
关键词行
|
||||||
|
行4
|
||||||
|
行5"""
|
||||||
|
|
||||||
|
result = search_markdown(content, "关键词", context_lines=1)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "关键词" in result
|
||||||
|
assert "行2" in result or "行4" in result
|
||||||
|
|
||||||
|
def test_search_no_match(self):
|
||||||
|
"""测试无匹配的情况。"""
|
||||||
|
content = "普通内容"
|
||||||
|
|
||||||
|
result = search_markdown(content, "不存在的内容", context_lines=0)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_search_empty_content(self):
|
||||||
|
"""测试空内容。"""
|
||||||
|
result = search_markdown("", "关键词", context_lines=0)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_search_invalid_regex(self):
|
||||||
|
"""测试无效正则表达式。"""
|
||||||
|
content = "内容"
|
||||||
|
|
||||||
|
result = search_markdown(content, "[invalid", context_lines=0)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_search_negative_context(self):
|
||||||
|
"""测试负的上下文行数。"""
|
||||||
|
content = "内容"
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
search_markdown(content, "内容", context_lines=-1)
|
||||||
|
|
||||||
|
def test_search_multiple_matches_merged(self):
|
||||||
|
"""测试多个匹配合并。"""
|
||||||
|
content = """行1
|
||||||
|
行2
|
||||||
|
匹配1
|
||||||
|
行4
|
||||||
|
行5
|
||||||
|
匹配2
|
||||||
|
行7
|
||||||
|
行8"""
|
||||||
|
|
||||||
|
result = search_markdown(content, "匹配", context_lines=1)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "匹配1" in result
|
||||||
|
assert "匹配2" in result
|
||||||
|
|
||||||
|
def test_search_ignore_blank_lines_in_context(self):
|
||||||
|
"""测试上下文计算忽略空行。"""
|
||||||
|
content = """行1
|
||||||
|
|
||||||
|
行2
|
||||||
|
关键词
|
||||||
|
|
||||||
|
行4
|
||||||
|
行5"""
|
||||||
|
|
||||||
|
result = search_markdown(content, "关键词", context_lines=1)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "关键词" in result
|
||||||
|
|
||||||
|
def test_search_with_regex(self):
|
||||||
|
"""测试使用正则表达式搜索。"""
|
||||||
|
content = """apple
|
||||||
|
banana
|
||||||
|
cherry
|
||||||
|
date"""
|
||||||
|
|
||||||
|
result = search_markdown(content, "^b", context_lines=0)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "banana" in result
|
||||||
256
tests/test_core/test_parser.py
Normal file
256
tests/test_core/test_parser.py
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
"""测试 parser 模块的解析调度功能。"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from core.parser import parse_input, process_content, output_result
|
||||||
|
from core.exceptions import FileDetectionError, ReaderNotFoundError
|
||||||
|
|
||||||
|
|
||||||
|
class MockReader:
    """Stand-in reader whose answers are fixed at construction time.

    ``supports`` always reports the configured flag, and ``parse`` always
    hands back the configured ``(content, failures)`` pair; the file path
    argument is ignored by both.
    """

    def __init__(self, supports=True, content=None, failures=None):
        self._supports = supports
        self._content = content
        # A missing (or empty) failure list is replaced by a new empty list.
        self._failures = failures if failures else []

    def supports(self, file_path):
        """Return the canned support flag, whatever the path is."""
        return self._supports

    def parse(self, file_path):
        """Return the canned ``(content, failures)`` tuple, whatever the path is."""
        canned_result = (self._content, self._failures)
        return canned_result
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseInput:
    """Tests for ``parse_input``."""

    def test_parse_input_success(self):
        """A supporting reader's content and empty failure list are returned."""
        candidates = [MockReader(supports=True, content="测试内容", failures=[])]

        parsed, problems = parse_input("test.docx", candidates)

        assert parsed == "测试内容"
        assert problems == []

    def test_parse_input_reader_not_found(self):
        """ReaderNotFoundError is raised when the only reader rejects the file."""
        candidates = [MockReader(supports=False)]

        with pytest.raises(ReaderNotFoundError):
            parse_input("test.docx", candidates)

    def test_parse_input_empty_path(self):
        """FileDetectionError is raised for an empty input path."""
        candidates = [MockReader()]

        with pytest.raises(FileDetectionError):
            parse_input("", candidates)

    def test_parse_input_multiple_readers_first_succeeds(self):
        """With two supporting readers, the first reader's content is returned."""
        first = MockReader(supports=True, content="第一个结果", failures=[])
        second = MockReader(supports=True, content="第二个结果", failures=[])

        parsed, _ = parse_input("test.docx", [first, second])

        assert parsed == "第一个结果"

    def test_parse_input_with_failures(self):
        """A reader that yields no content passes its failure messages through."""
        failing = MockReader(
            supports=True,
            content=None,
            failures=["解析器1失败", "解析器2失败"],
        )

        parsed, problems = parse_input("test.docx", [failing])

        assert parsed is None
        assert problems == ["解析器1失败", "解析器2失败"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestProcessContent:
    """Tests for ``process_content``."""

    def test_process_content_removes_images(self):
        """The image markup is gone while the surrounding text is kept."""
        cleaned = process_content("测试内容  更多内容")

        assert "" not in cleaned
        for kept in ("测试内容", "更多内容"):
            assert kept in cleaned

    def test_process_content_normalizes_whitespace(self):
        """Runs of newlines between the three lines end up as one blank line each."""
        raw = "line1\n\n\n\nline2\n\n\nline3"

        cleaned = process_content(raw)

        assert "line1\n\nline2\n\nline3" in cleaned

    def test_process_content_both_operations(self):
        """Image removal and newline collapsing are both applied to one input."""
        raw = "\n\n\n\n正文"

        cleaned = process_content(raw)

        assert "" not in cleaned
        assert "\n\n\n\n" not in cleaned
|
||||||
|
|
||||||
|
|
||||||
|
class TestOutputResult:
    """Tests for ``output_result``."""

    @staticmethod
    def _make_args(**overrides):
        """Build the namespace handed to ``output_result``.

        Every option starts at the neutral value the tests share (flags off,
        no title/search filter, ``context=2``); keyword arguments replace
        individual options so each test states only what it exercises.
        """
        options = {
            "count": False,
            "lines": False,
            "titles": False,
            "title_content": None,
            "search": None,
            "context": 2,
        }
        options.update(overrides)
        return argparse.Namespace(**options)

    def test_output_default(self, capsys):
        """With no option set, the content itself is printed."""
        output_result("测试内容", self._make_args())

        captured = capsys.readouterr()
        assert "测试内容" in captured.out

    def test_output_count(self, capsys):
        """``count=True`` prints the character count."""
        output_result("测试内容", self._make_args(count=True))

        captured = capsys.readouterr()
        assert captured.out.strip() == "4"

    def test_output_lines(self, capsys):
        """``lines=True`` prints the line count."""
        output_result("line1\nline2\nline3", self._make_args(lines=True))

        captured = capsys.readouterr()
        assert captured.out.strip() == "3"

    def test_output_titles(self, capsys):
        """``titles=True`` prints the heading lines."""
        content = "# 标题1\n正文\n## 标题2\n正文"

        output_result(content, self._make_args(titles=True))

        captured = capsys.readouterr()
        assert "# 标题1" in captured.out
        assert "## 标题2" in captured.out

    def test_output_title_content_found(self, capsys):
        """An existing title prints its section and does not call ``sys.exit``."""
        args = self._make_args(title_content="目标标题")
        content = "# 目标标题\n标题下的内容"

        # sys.exit is patched so an unexpected exit shows up as a failed
        # assertion instead of terminating the test run.
        with patch("sys.exit") as mock_exit:
            output_result(content, args)
            mock_exit.assert_not_called()

        captured = capsys.readouterr()
        assert "目标标题" in captured.out
        assert "标题下的内容" in captured.out

    def test_output_title_content_not_found(self, capsys):
        """A missing title reports an error and calls ``sys.exit(1)``."""
        args = self._make_args(title_content="不存在的标题")
        content = "# 标题1\n内容"

        with patch("sys.exit") as mock_exit:
            output_result(content, args)
            mock_exit.assert_called_once_with(1)

        captured = capsys.readouterr()
        assert "未找到" in captured.out or "错误" in captured.out

    def test_output_search_found(self, capsys):
        """A matching search prints the hit and does not call ``sys.exit``."""
        args = self._make_args(search="关键词")
        content = "行1\n行2\n包含关键词的行\n行4\n行5"

        with patch("sys.exit") as mock_exit:
            output_result(content, args)
            mock_exit.assert_not_called()

        captured = capsys.readouterr()
        assert "关键词" in captured.out

    def test_output_search_not_found(self, capsys):
        """A search without hits reports an error and calls ``sys.exit(1)``."""
        args = self._make_args(search="不存在的内容")
        content = "普通内容"

        with patch("sys.exit") as mock_exit:
            output_result(content, args)
            mock_exit.assert_called_once_with(1)

        captured = capsys.readouterr()
        assert "未找到" in captured.out or "错误" in captured.out
|
||||||
@@ -4,194 +4,85 @@ import pytest
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
# 静态测试文件目录
|
||||||
def temp_docx(tmp_path):
|
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
||||||
"""创建临时 DOCX 文件的 fixture 工厂。
|
|
||||||
|
|
||||||
Args:
|
|
||||||
paragraphs: 段落文本列表
|
|
||||||
headings: 标题列表,格式为 [(level, text), ...]
|
|
||||||
table_data: 表格数据,格式为 [[cell1, cell2], [cell3, cell4]]
|
|
||||||
list_items: 列表项列表
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: 临时文件路径
|
|
||||||
"""
|
|
||||||
def _create_docx(paragraphs=None, headings=None, table_data=None, list_items=None):
|
|
||||||
try:
|
|
||||||
from docx import Document
|
|
||||||
except ImportError:
|
|
||||||
pytest.skip("python-docx 未安装")
|
|
||||||
|
|
||||||
doc = Document()
|
|
||||||
|
|
||||||
# 添加标题
|
|
||||||
if headings:
|
|
||||||
for level, text in headings:
|
|
||||||
doc.add_heading(text, level=level)
|
|
||||||
|
|
||||||
# 添加段落
|
|
||||||
if paragraphs:
|
|
||||||
for para_text in paragraphs:
|
|
||||||
doc.add_paragraph(para_text)
|
|
||||||
|
|
||||||
# 添加表格
|
|
||||||
if table_data:
|
|
||||||
table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
|
|
||||||
for i, row_data in enumerate(table_data):
|
|
||||||
for j, cell_text in enumerate(row_data):
|
|
||||||
table.rows[i].cells[j].text = str(cell_text)
|
|
||||||
|
|
||||||
# 添加列表项
|
|
||||||
if list_items:
|
|
||||||
for item in list_items:
|
|
||||||
doc.add_paragraph(item, style='List Bullet')
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.docx"
|
|
||||||
doc.save(str(file_path))
|
|
||||||
return str(file_path)
|
|
||||||
|
|
||||||
return _create_docx
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def temp_pdf(tmp_path):
|
def doc_fixture_path():
|
||||||
"""创建临时 PDF 文件的 fixture 工厂。
|
"""返回 DOC 静态测试文件目录"""
|
||||||
|
return FIXTURES_DIR / "doc"
|
||||||
Args:
|
|
||||||
text: PDF 文本内容
|
|
||||||
lines: 文本行列表
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: 临时文件路径
|
|
||||||
"""
|
|
||||||
def _create_pdf(text=None, lines=None):
|
|
||||||
try:
|
|
||||||
from reportlab.pdfgen import canvas
|
|
||||||
from reportlab.lib.pagesizes import letter
|
|
||||||
from reportlab.pdfbase import pdfmetrics
|
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
|
||||||
except ImportError:
|
|
||||||
pytest.skip("reportlab 未安装")
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.pdf"
|
|
||||||
c = canvas.Canvas(str(file_path), pagesize=letter)
|
|
||||||
|
|
||||||
# 尝试注册中文字体(如果可用)
|
|
||||||
try:
|
|
||||||
# 使用系统字体
|
|
||||||
pdfmetrics.registerFont(TTFont('SimSun', 'simsun.ttc'))
|
|
||||||
c.setFont('SimSun', 12)
|
|
||||||
except:
|
|
||||||
# 回退到默认字体
|
|
||||||
c.setFont('Helvetica', 12)
|
|
||||||
|
|
||||||
y_position = 750
|
|
||||||
|
|
||||||
if text:
|
|
||||||
# 单个文本块
|
|
||||||
for line in text.split('\n'):
|
|
||||||
c.drawString(100, y_position, line)
|
|
||||||
y_position -= 20
|
|
||||||
|
|
||||||
if lines:
|
|
||||||
# 多行文本
|
|
||||||
for line in lines:
|
|
||||||
c.drawString(100, y_position, line)
|
|
||||||
y_position -= 20
|
|
||||||
|
|
||||||
c.save()
|
|
||||||
return str(file_path)
|
|
||||||
|
|
||||||
return _create_pdf
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def temp_html(tmp_path):
|
def xls_fixture_path():
|
||||||
"""创建临时 HTML 文件的 fixture 工厂。
|
"""返回 XLS 静态测试文件目录"""
|
||||||
|
return FIXTURES_DIR / "xls"
|
||||||
Args:
|
|
||||||
content: HTML 内容字符串
|
|
||||||
encoding: 文件编码,默认 'utf-8'
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: 临时文件路径
|
|
||||||
"""
|
|
||||||
def _create_html(content="<html><body><p>Test</p></body></html>", encoding='utf-8'):
|
|
||||||
file_path = tmp_path / "test.html"
|
|
||||||
|
|
||||||
# 如果内容不包含完整的 HTML 结构,添加基本结构
|
|
||||||
if not content.strip().startswith('<html'):
|
|
||||||
content = f"<html><head><meta charset='{encoding}'></head><body>{content}</body></html>"
|
|
||||||
|
|
||||||
with open(file_path, 'w', encoding=encoding) as f:
|
|
||||||
f.write(content)
|
|
||||||
|
|
||||||
return str(file_path)
|
|
||||||
|
|
||||||
return _create_html
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def temp_pptx(tmp_path):
|
def ppt_fixture_path():
|
||||||
"""创建临时 PPTX 文件的 fixture 工厂。
|
"""返回 PPT 静态测试文件目录"""
|
||||||
|
return FIXTURES_DIR / "ppt"
|
||||||
|
|
||||||
Args:
|
|
||||||
slides: 幻灯片内容列表,每个元素为 (title, content) 元组
|
|
||||||
|
|
||||||
Returns:
|
def _get_static_file_path(fixture_dir, filename):
|
||||||
str: 临时文件路径
|
"""获取静态文件路径,不存在时跳过测试"""
|
||||||
"""
|
file_path = fixture_dir / filename
|
||||||
def _create_pptx(slides=None):
|
if not file_path.exists():
|
||||||
try:
|
pytest.skip(f"静态测试文件不存在: {file_path}")
|
||||||
from pptx import Presentation
|
|
||||||
except ImportError:
|
|
||||||
pytest.skip("python-pptx 未安装")
|
|
||||||
|
|
||||||
prs = Presentation()
|
|
||||||
|
|
||||||
if slides:
|
|
||||||
for title, content in slides:
|
|
||||||
slide = prs.slides.add_slide(prs.slide_layouts[1]) # Title and Content layout
|
|
||||||
slide.shapes.title.text = title
|
|
||||||
if content:
|
|
||||||
text_frame = slide.shapes.placeholders[1].text_frame
|
|
||||||
text_frame.text = content
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.pptx"
|
|
||||||
prs.save(str(file_path))
|
|
||||||
return str(file_path)
|
return str(file_path)
|
||||||
|
|
||||||
return _create_pptx
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def temp_xlsx(tmp_path):
|
def simple_doc_path(doc_fixture_path):
|
||||||
"""创建临时 XLSX 文件的 fixture 工厂。
|
"""返回简单 DOC 测试文件路径"""
|
||||||
|
return _get_static_file_path(doc_fixture_path, "simple.doc")
|
||||||
|
|
||||||
Args:
|
|
||||||
data: 表格数据,格式为 [[cell1, cell2], [cell3, cell4]]
|
|
||||||
|
|
||||||
Returns:
|
@pytest.fixture
|
||||||
str: 临时文件路径
|
def with_headings_doc_path(doc_fixture_path):
|
||||||
"""
|
"""返回带标题的 DOC 测试文件路径"""
|
||||||
def _create_xlsx(data=None):
|
return _get_static_file_path(doc_fixture_path, "with_headings.doc")
|
||||||
try:
|
|
||||||
import pandas as pd
|
|
||||||
except ImportError:
|
|
||||||
pytest.skip("pandas 未安装")
|
|
||||||
|
|
||||||
file_path = tmp_path / "test.xlsx"
|
|
||||||
|
|
||||||
if data:
|
@pytest.fixture
|
||||||
df = pd.DataFrame(data)
|
def with_table_doc_path(doc_fixture_path):
|
||||||
df.to_excel(str(file_path), index=False, header=False)
|
"""返回带表格的 DOC 测试文件路径"""
|
||||||
else:
|
return _get_static_file_path(doc_fixture_path, "with_table.doc")
|
||||||
# 创建空的 Excel 文件
|
|
||||||
df = pd.DataFrame()
|
|
||||||
df.to_excel(str(file_path), index=False)
|
|
||||||
|
|
||||||
return str(file_path)
|
|
||||||
|
|
||||||
return _create_xlsx
|
@pytest.fixture
def simple_xls_path(xls_fixture_path):
    """Path of the ``simple.xls`` sample inside the XLS fixture directory."""
    workbook_name = "simple.xls"
    return _get_static_file_path(xls_fixture_path, workbook_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def multiple_sheets_xls_path(xls_fixture_path):
    """Path of the ``multiple_sheets.xls`` sample inside the XLS fixture directory."""
    workbook_name = "multiple_sheets.xls"
    return _get_static_file_path(xls_fixture_path, workbook_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def with_formulas_xls_path(xls_fixture_path):
    """Path of the ``with_formulas.xls`` sample inside the XLS fixture directory."""
    workbook_name = "with_formulas.xls"
    return _get_static_file_path(xls_fixture_path, workbook_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def simple_ppt_path(ppt_fixture_path):
    """Path of the ``simple.ppt`` sample inside the PPT fixture directory."""
    deck_name = "simple.ppt"
    return _get_static_file_path(ppt_fixture_path, deck_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def multiple_slides_ppt_path(ppt_fixture_path):
    """Path of the ``multiple_slides.ppt`` sample inside the PPT fixture directory."""
    deck_name = "multiple_slides.ppt"
    return _get_static_file_path(ppt_fixture_path, deck_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def with_images_ppt_path(ppt_fixture_path):
    """Path of the ``with_images.ppt`` sample inside the PPT fixture directory."""
    deck_name = "with_images.ppt"
    return _get_static_file_path(ppt_fixture_path, deck_name)
|
||||||
|
|||||||
BIN
tests/test_readers/fixtures/doc/simple.doc
LFS
Normal file
BIN
tests/test_readers/fixtures/doc/simple.doc
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/doc/with_headings.doc
LFS
Normal file
BIN
tests/test_readers/fixtures/doc/with_headings.doc
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/doc/with_table.doc
LFS
Normal file
BIN
tests/test_readers/fixtures/doc/with_table.doc
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/ppt/multiple_slides.ppt
LFS
Normal file
BIN
tests/test_readers/fixtures/ppt/multiple_slides.ppt
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/ppt/simple.ppt
LFS
Normal file
BIN
tests/test_readers/fixtures/ppt/simple.ppt
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/ppt/with_images.ppt
LFS
Normal file
BIN
tests/test_readers/fixtures/ppt/with_images.ppt
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/xls/multiple_sheets.xls
LFS
Normal file
BIN
tests/test_readers/fixtures/xls/multiple_sheets.xls
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/xls/simple.xls
LFS
Normal file
BIN
tests/test_readers/fixtures/xls/simple.xls
LFS
Normal file
Binary file not shown.
BIN
tests/test_readers/fixtures/xls/with_formulas.xls
LFS
Normal file
BIN
tests/test_readers/fixtures/xls/with_formulas.xls
LFS
Normal file
Binary file not shown.
1
tests/test_readers/test_doc/__init__.py
Normal file
1
tests/test_readers/test_doc/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""测试 DOC Reader 的解析功能。"""
|
||||||
25
tests/test_readers/test_doc/test_consistency.py
Normal file
25
tests/test_readers/test_doc/test_consistency.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
"""测试所有 DOC Readers 的一致性。"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from readers.doc import libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocReadersConsistency:
    """Basic checks on the DOC reader module(s) imported by this file."""

    def test_parsers_importable(self):
        """The parser module was imported and exposes a ``parse`` attribute."""
        assert libreoffice is not None
        assert hasattr(libreoffice, 'parse')

    def test_parser_functions_callable(self):
        """``parse`` is a callable."""
        assert callable(libreoffice.parse)

    def test_libreoffice_parse_simple_doc(self, simple_doc_path):
        """Parsing the simple sample yields non-blank text.

        LibreOffice may not be installed, so a parse that returns no content
        is not treated as a failure. It is reported as an explicit skip
        (carrying the parser's error) instead of passing silently, so the
        test report shows that nothing was actually verified.
        """
        content, error = libreoffice.parse(simple_doc_path)
        if content is None:
            pytest.skip(f"LibreOffice 解析未成功: {error}")

        assert content.strip() != ""
|
||||||
35
tests/test_readers/test_doc/test_libreoffice.py
Normal file
35
tests/test_readers/test_doc/test_libreoffice.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""测试 LibreOffice DOC Reader 的解析功能。"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from readers.doc import libreoffice
|
||||||
|
|
||||||
|
|
||||||
|
class TestLibreOfficeDocReaderParse:
    """Tests for ``libreoffice.parse`` on DOC files."""

    @staticmethod
    def _parse_or_skip(file_path):
        """Parse ``file_path`` and return its content, skipping if there is none.

        A parse that returns no content (for example because LibreOffice is
        not available) used to let the calling test pass without checking
        anything; skipping with the parser's error makes that visible in the
        test report.
        """
        content, error = libreoffice.parse(file_path)
        if content is None:
            pytest.skip(f"LibreOffice 解析未成功: {error}")
        return content

    def test_simple_doc(self, simple_doc_path):
        """The simple DOC sample parses to non-blank text."""
        content = self._parse_or_skip(simple_doc_path)
        assert content.strip() != ""

    def test_with_headings_doc(self, with_headings_doc_path):
        """The DOC sample with headings parses to non-blank text."""
        content = self._parse_or_skip(with_headings_doc_path)
        assert content.strip() != ""

    def test_with_table_doc(self, with_table_doc_path):
        """The DOC sample with a table parses to non-blank text."""
        content = self._parse_or_skip(with_table_doc_path)
        assert content.strip() != ""

    def test_file_not_exists(self, tmp_path):
        """A path that does not exist yields no content and an error value."""
        non_existent_file = str(tmp_path / "non_existent.doc")

        content, error = libreoffice.parse(non_existent_file)

        assert content is None
        assert error is not None
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user