Compare commits
6 Commits
58093e0877
...
9daff73589
| Author | SHA1 | Date | |
|---|---|---|---|
| 9daff73589 | |||
| 6e75c99d5b | |||
| d860e17b2c | |||
| c140bda66b | |||
| dfe6904f4c | |||
| b2fb418a06 |
@@ -1,7 +1,6 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"WebSearch",
|
||||
"WebFetch(*)",
|
||||
"Bash(openspec:*)",
|
||||
"Bash(git:*)",
|
||||
@@ -12,6 +11,9 @@
|
||||
"mcp__context7__query-docs",
|
||||
"mcp__exa__web_search_exa",
|
||||
"mcp__exa__get_code_context_exa"
|
||||
],
|
||||
"deny": [
|
||||
"WebSearch"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -174,6 +174,9 @@ ipython_config.py
|
||||
# pipenv
|
||||
Pipfile.lock
|
||||
|
||||
# uv
|
||||
uv.lock
|
||||
|
||||
# PEP 582
|
||||
__pypackages__/
|
||||
|
||||
|
||||
141
README.md
141
README.md
@@ -4,9 +4,9 @@
|
||||
|
||||
## 开发环境
|
||||
|
||||
- 使用 uv 管理依赖,禁用主机 Python
|
||||
- 依赖声明:pyproject.toml
|
||||
- 安装:uv sync
|
||||
- 使用 uv 运行脚本和测试,禁用主机 Python
|
||||
- 依赖管理:使用 `uv run --with` 按需加载依赖
|
||||
- 依赖说明:详见 SKILL.md 的"依赖安装指南"章节
|
||||
|
||||
## 项目结构
|
||||
|
||||
@@ -22,25 +22,128 @@ skill/ # SKILL 文档
|
||||
|
||||
## 开发工作流
|
||||
|
||||
使用 `uv run --with` 方式运行测试和开发工具:
|
||||
|
||||
```bash
|
||||
# 运行测试
|
||||
uv run pytest
|
||||
# 运行测试(pytest 通过 --with 按需加载,无需预先安装)
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with pytest-cov \
|
||||
--with chardet \
|
||||
pytest
|
||||
|
||||
# 运行测试并查看覆盖率
|
||||
uv run pytest --cov=scripts --cov-report=term-missing
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with pytest-cov \
|
||||
--with chardet \
|
||||
pytest --cov=scripts --cov-report=term-missing
|
||||
|
||||
# 运行特定测试文件
|
||||
uv run pytest tests/test_readers/test_docx/
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_docx/
|
||||
|
||||
# 运行特定测试类或方法
|
||||
uv run pytest tests/test_cli/test_main.py::TestCLIDefaultOutput::test_default_output_docx
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with chardet \
|
||||
pytest tests/test_cli/test_main.py::TestCLIDefaultOutput::test_default_output_docx
|
||||
|
||||
# 代码格式化
|
||||
uv run black .
|
||||
uv run isort .
|
||||
uv run \
|
||||
--with black \
|
||||
--with isort \
|
||||
--with chardet \
|
||||
bash -c "black . && isort ."
|
||||
|
||||
# 类型检查
|
||||
uv run mypy .
|
||||
uv run \
|
||||
--with mypy \
|
||||
--with chardet \
|
||||
mypy .
|
||||
```
|
||||
|
||||
**测试 DOCX reader**:
|
||||
|
||||
```bash
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with docling \
|
||||
--with "unstructured[docx]" \
|
||||
--with "markitdown[docx]" \
|
||||
--with pypandoc-binary \
|
||||
--with python-docx \
|
||||
--with markdownify \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_docx/
|
||||
```
|
||||
|
||||
**测试 PDF reader**:
|
||||
|
||||
```bash
|
||||
# 默认命令(macOS ARM、Linux、Windows)
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with docling \
|
||||
--with "unstructured[pdf]" \
|
||||
--with "markitdown[pdf]" \
|
||||
--with pypdf \
|
||||
--with markdownify \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_pdf/
|
||||
|
||||
# macOS x86_64 (Intel) 特殊命令
|
||||
uv run \
|
||||
--python 3.12 \
|
||||
--with pytest \
|
||||
--with "docling==2.40.0" \
|
||||
--with "docling-parse==4.0.0" \
|
||||
--with "numpy<2" \
|
||||
--with "markitdown[pdf]" \
|
||||
--with pypdf \
|
||||
--with markdownify \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_pdf/
|
||||
```
|
||||
|
||||
**测试其他格式**:
|
||||
|
||||
```bash
|
||||
# XLSX reader
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with docling \
|
||||
--with "unstructured[xlsx]" \
|
||||
--with "markitdown[xlsx]" \
|
||||
--with pandas \
|
||||
--with tabulate \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_xlsx/
|
||||
|
||||
# PPTX reader
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with docling \
|
||||
--with "unstructured[pptx]" \
|
||||
--with "markitdown[pptx]" \
|
||||
--with python-pptx \
|
||||
--with markdownify \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_pptx/
|
||||
|
||||
# HTML reader
|
||||
uv run \
|
||||
--with pytest \
|
||||
--with trafilatura \
|
||||
--with domscribe \
|
||||
--with markitdown \
|
||||
--with html2text \
|
||||
--with beautifulsoup4 \
|
||||
--with httpx \
|
||||
--with chardet \
|
||||
pytest tests/test_readers/test_html/
|
||||
```
|
||||
|
||||
## 测试
|
||||
@@ -57,10 +160,8 @@ uv run mypy .
|
||||
- 编码测试(GBK、UTF-8 BOM 等)
|
||||
- 一致性测试(验证不同 Reader 解析结果的一致性)
|
||||
|
||||
运行测试前确保已安装所有依赖:
|
||||
```bash
|
||||
uv sync
|
||||
```
|
||||
运行测试前,请根据测试类型使用 `uv run --with` 安装对应的依赖包。详见上方的"开发工作流"章节和 SKILL.md 的"依赖安装指南"。
|
||||
|
||||
|
||||
## 代码规范
|
||||
|
||||
@@ -91,16 +192,12 @@ skill/SKILL.md 面向 AI 用户,必须遵循 Claude Skill 构建指南的最
|
||||
6. **错误处理**: 常见错误及解决方案
|
||||
7. **References**: 指向项目文档的链接
|
||||
|
||||
### 双路径执行策略
|
||||
|
||||
- **优先**: 使用 lyxy-runner-python skill(自动管理依赖)
|
||||
- **回退**: 主机 Python 环境(需手动安装依赖)
|
||||
|
||||
### 依赖说明
|
||||
### 依赖管理
|
||||
|
||||
- 使用 `uv run --with` 方式按需加载依赖
|
||||
- 必须使用具体的 pip 包名
|
||||
- 不能使用 lyxy-document[xxx] 形式(发布时没有 pyproject.toml)
|
||||
- 按文档类型分组说明
|
||||
- 详见 SKILL.md 的"依赖安装指南"章节
|
||||
|
||||
## 解析器架构
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ license: MIT
|
||||
metadata:
|
||||
version: "1.0"
|
||||
author: lyxy
|
||||
compatibility: Requires Python 3.11+. 优先使用 lyxy-runner-python skill 执行(自动管理依赖)。回退到主机 Python 时需手动安装依赖:DOCX(docling unstructured markitdown pypandoc-binary python-docx markdownify chardet) / XLSX(docling unstructured markitdown pandas tabulate chardet) / PPTX(docling unstructured markitdown python-pptx markdownify chardet) / PDF(docling unstructured unstructured-paddleocr markitdown pypdf markdownify chardet) / HTML(trafilatura domscribe markitdown html2text beautifulsoup4 httpx chardet) / HTTP增强(pyppeteer selenium)
|
||||
compatibility: Requires Python 3.11+. 使用 uv run --with 方式按需加载依赖,详见"依赖安装指南"章节。
|
||||
---
|
||||
|
||||
# 统一文档解析 Skill
|
||||
@@ -16,7 +16,7 @@ compatibility: Requires Python 3.11+. 优先使用 lyxy-runner-python skill 执
|
||||
|
||||
**统一入口**:使用 `scripts/lyxy_document_reader.py` 作为统一的命令行入口,自动识别文件类型并执行解析。
|
||||
|
||||
**双路径执行**:此 skill 必须优先使用 **lyxy-runner-python skill** 执行脚本,该 skill 会自动管理 uv 隔离环境和依赖。当 lyxy-runner-python 不可用时,回退到主机 Python 环境执行。
|
||||
**依赖管理**:使用 `uv run --with` 方式按需加载解析器依赖。每次执行时根据文档类型指定对应的依赖包。
|
||||
|
||||
**支持的文档类型**:
|
||||
- **DOCX**:Word 文档
|
||||
@@ -78,17 +78,16 @@ compatibility: Requires Python 3.11+. 优先使用 lyxy-runner-python skill 执
|
||||
|
||||
### 基本语法
|
||||
|
||||
```bash
|
||||
# 方式 1:使用 lyxy-runner-python(推荐)
|
||||
# lyxy-runner-python 会自动分析脚本依赖并使用 uv --with 安装
|
||||
# AI 只需执行:
|
||||
python scripts/lyxy_document_reader.py <文件路径或URL>
|
||||
使用 `uv run --with` 按需加载依赖包:
|
||||
|
||||
# 方式 2:回退到主机 Python(需要预先手动安装依赖)
|
||||
# 根据文档类型安装对应依赖后执行:
|
||||
python scripts/lyxy_document_reader.py <文件路径或URL>
|
||||
```bash
|
||||
# 根据文档类型选择对应的依赖包
|
||||
uv run --with <依赖包1> --with <依赖包2> ... \
|
||||
scripts/lyxy_document_reader.py <文件路径或URL>
|
||||
```
|
||||
|
||||
具体的依赖包列表请参考下方的"依赖安装指南"。
|
||||
|
||||
### 使用示例
|
||||
|
||||
```bash
|
||||
@@ -117,31 +116,72 @@ python scripts/lyxy_document_reader.py document.docx -s "\d{4}-\d{2}-\d{2}"
|
||||
python scripts/lyxy_document_reader.py document.docx -s "关键词" -n 5
|
||||
```
|
||||
|
||||
### 主机 Python 环境依赖安装
|
||||
### 依赖安装指南
|
||||
|
||||
当 lyxy-runner-python 不可用时,需要根据文档类型手动安装依赖:
|
||||
使用 `uv run --with` 方式按需加载解析器依赖。以下命令适用于大多数平台(macOS ARM、Linux、Windows)。
|
||||
|
||||
#### 平台检测
|
||||
|
||||
在遇到问题时,可以检测你的平台:
|
||||
|
||||
```bash
|
||||
# DOCX 文档
|
||||
pip install docling unstructured markitdown pypandoc-binary python-docx markdownify chardet
|
||||
# macOS / Linux
|
||||
uname -m # 显示架构: x86_64 或 arm64
|
||||
uname -s # 显示系统: Darwin 或 Linux
|
||||
|
||||
# XLSX 表格
|
||||
pip install docling unstructured markitdown pandas tabulate chardet
|
||||
# Windows PowerShell
|
||||
$env:OS # 或检查环境变量
|
||||
|
||||
# PPTX 演示文稿
|
||||
pip install docling unstructured markitdown python-pptx markdownify chardet
|
||||
# Python 跨平台检测
|
||||
python -c "import platform; print(f'{platform.system()}-{platform.machine()}')"
|
||||
```
|
||||
|
||||
# PDF 文档
|
||||
pip install docling unstructured unstructured-paddleocr markitdown pypdf markdownify chardet
|
||||
#### PDF 解析
|
||||
|
||||
# HTML/URL 网页
|
||||
pip install trafilatura domscribe markitdown html2text beautifulsoup4 httpx chardet
|
||||
**默认命令**(适用于 macOS ARM、Linux、Windows):
|
||||
|
||||
# 网页(需要 JS 渲染时,额外添加)
|
||||
pip install pyppeteer selenium
|
||||
```bash
|
||||
uv run --with docling --with "unstructured[pdf]" --with "markitdown[pdf]" --with pypdf --with markdownify --with chardet scripts/lyxy_document_reader.py file.pdf
|
||||
```
|
||||
|
||||
# 安装所有文档类型支持
|
||||
pip install docling unstructured unstructured-paddleocr markitdown pypandoc-binary python-docx python-pptx pandas tabulate pypdf markdownify trafilatura domscribe html2text beautifulsoup4 httpx pyppeteer selenium chardet
|
||||
**macOS x86_64 (Intel) 特殊说明**:
|
||||
|
||||
此平台需要使用 Python 3.12 和特定版本的依赖:
|
||||
|
||||
```bash
|
||||
uv run --python 3.12 --with "docling==2.40.0" --with "docling-parse==4.0.0" --with "numpy<2" --with "markitdown[pdf]" --with pypdf --with markdownify --with chardet scripts/lyxy_document_reader.py file.pdf
|
||||
```
|
||||
|
||||
原因:`docling-parse` 5.x 无 x86_64 wheel,必须使用 4.0.0;`easyocr`(docling 的 OCR 后端)与 NumPy 2.x 不兼容。
|
||||
|
||||
#### DOCX 解析
|
||||
|
||||
```bash
|
||||
uv run --with docling --with "unstructured[docx]" --with "markitdown[docx]" --with pypandoc-binary --with python-docx --with markdownify --with chardet scripts/lyxy_document_reader.py file.docx
|
||||
```
|
||||
|
||||
#### XLSX 解析
|
||||
|
||||
```bash
|
||||
uv run --with docling --with "unstructured[xlsx]" --with "markitdown[xlsx]" --with pandas --with tabulate --with chardet scripts/lyxy_document_reader.py file.xlsx
|
||||
```
|
||||
|
||||
#### PPTX 解析
|
||||
|
||||
```bash
|
||||
uv run --with docling --with "unstructured[pptx]" --with "markitdown[pptx]" --with python-pptx --with markdownify --with chardet scripts/lyxy_document_reader.py file.pptx
|
||||
```
|
||||
|
||||
#### HTML/URL 解析
|
||||
|
||||
```bash
|
||||
uv run --with trafilatura --with domscribe --with markitdown --with html2text --with beautifulsoup4 --with httpx --with chardet scripts/lyxy_document_reader.py https://example.com
|
||||
```
|
||||
|
||||
**需要 JavaScript 渲染的网页**,额外添加:
|
||||
|
||||
```bash
|
||||
--with pyppeteer --with selenium
|
||||
```
|
||||
|
||||
## 错误处理
|
||||
127
build.py
127
build.py
@@ -1,11 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Skill 打包构建脚本
|
||||
将 skill/SKILL.md 和 scripts/ 目录打包到 build/ 目录
|
||||
|
||||
使用方式:
|
||||
# 开发模式 - 快速构建,不混淆
|
||||
uv run python build.py
|
||||
|
||||
# 发布模式 - 完整构建,PyArmor 混淆
|
||||
uv run --with pyarmor python build.py --obfuscate
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import subprocess
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@@ -88,20 +97,113 @@ def copy_scripts_dir(source_dir: str, target_dir: str) -> int:
|
||||
return file_count
|
||||
|
||||
|
||||
def obfuscate_scripts_dir(source_dir: str, target_dir: str) -> None:
    """Obfuscate the scripts directory with PyArmor and place the output in target_dir.

    PyArmor writes into a scratch directory (``<target_dir>/temp_pyarmor``);
    every top-level entry it produces is then moved into ``target_dir``,
    replacing any existing file or directory of the same name. The scratch
    directory is removed afterwards, including when obfuscation fails.

    Args:
        source_dir: Source code directory (scripts/).
        target_dir: Destination build directory (build/).

    Raises:
        SystemExit: With exit code 1 when the ``pyarmor`` package cannot be
            imported, when the ``pyarmor`` command cannot be started, or when
            it exits with a non-zero status.
    """
    # Fail early with a usage hint when PyArmor has not been loaded.
    try:
        __import__("pyarmor")
    except ImportError:
        print("""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
错误: PyArmor 未安装

请使用以下命令启用混淆:

uv run --with pyarmor python build.py --obfuscate

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
""")
        sys.exit(1)

    # Scratch directory for PyArmor output; start from a clean state.
    temp_dir = os.path.join(target_dir, "temp_pyarmor")
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    # PyArmor command (Normal Mode).
    cmd = [
        "pyarmor",
        "gen",
        "--recursive",
        "-O", temp_dir,
        source_dir,
    ]

    print(f" 执行: {' '.join(cmd)}")

    try:
        subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print("\nPyArmor 混淆失败:")
        print(f" 返回码: {e.returncode}")
        print(f" 标准输出: {e.stdout}")
        print(f" 错误输出: {e.stderr}")
        sys.exit(1)
    except OSError as e:
        # The package may be importable while the `pyarmor` executable is
        # missing from PATH; report that instead of leaking a traceback.
        print("\nPyArmor 混淆失败:")
        print(f" 无法执行 pyarmor 命令: {e}")
        sys.exit(1)
    else:
        # Move the obfuscated output into its final location, replacing
        # whatever is already there under the same name.
        for item in os.listdir(temp_dir):
            src = os.path.join(temp_dir, item)
            dst = os.path.join(target_dir, item)

            if os.path.exists(dst):
                if os.path.isdir(dst):
                    shutil.rmtree(dst)
                else:
                    os.remove(dst)

            shutil.move(src, dst)
    finally:
        # Always drop the scratch directory so a failed run does not leave
        # partial output inside the build directory.
        shutil.rmtree(temp_dir, ignore_errors=True)

    print(" 混淆完成")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""
|
||||
主函数:执行完整的打包流程
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Skill 打包构建",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
使用示例:
|
||||
# 开发模式 - 快速构建,不混淆
|
||||
uv run python build.py
|
||||
|
||||
# 发布模式 - 完整构建,PyArmor 混淆
|
||||
uv run --with pyarmor python build.py --obfuscate
|
||||
"""
|
||||
)
|
||||
parser.add_argument(
|
||||
"--obfuscate",
|
||||
action="store_true",
|
||||
help="使用 PyArmor 混淆代码 (需: uv run --with pyarmor)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("=" * 60)
|
||||
print("Skill 打包构建")
|
||||
print("=" * 60)
|
||||
|
||||
# 路径配置
|
||||
project_root = os.path.dirname(os.path.abspath(__file__))
|
||||
skill_md_path = os.path.join(project_root, "skill", "SKILL.md")
|
||||
skill_md_path = os.path.join(project_root, "SKILL.md")
|
||||
scripts_source_dir = os.path.join(project_root, "scripts")
|
||||
build_dir = os.path.join(project_root, "build")
|
||||
scripts_target_dir = os.path.join(build_dir, "scripts")
|
||||
|
||||
# 生成时间戳
|
||||
version = generate_timestamp()
|
||||
@@ -116,16 +218,27 @@ def main() -> None:
|
||||
copy_skill_md(skill_md_path, build_dir)
|
||||
print()
|
||||
|
||||
# 复制 scripts 目录
|
||||
print("复制 scripts/ 目录(仅 .py 文件):")
|
||||
file_count = copy_scripts_dir(scripts_source_dir, scripts_target_dir)
|
||||
# 根据 --obfuscate 选择执行路径
|
||||
if args.obfuscate:
|
||||
print("────────────────────────────────────────")
|
||||
print(" 使用 PyArmor 混淆代码 (Normal Mode)")
|
||||
print("────────────────────────────────────────")
|
||||
obfuscate_scripts_dir(scripts_source_dir, build_dir)
|
||||
file_count = None
|
||||
else:
|
||||
scripts_target_dir = os.path.join(build_dir, "scripts")
|
||||
print("复制 scripts/ 目录(仅 .py 文件):")
|
||||
file_count = copy_scripts_dir(scripts_source_dir, scripts_target_dir)
|
||||
print()
|
||||
|
||||
# 完成信息
|
||||
print("=" * 60)
|
||||
print("构建完成!")
|
||||
print(f"版本号: {version}")
|
||||
print(f"复制文件数: {file_count}")
|
||||
if file_count is not None:
|
||||
print(f"复制文件数: {file_count}")
|
||||
else:
|
||||
print("混淆模式: 已生成 .pyx 和 pyarmor_runtime")
|
||||
print(f"输出目录: {build_dir}")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
@@ -3,13 +3,13 @@ schema: spec-driven
|
||||
context: |
|
||||
# 项目规范
|
||||
- 语言: 仅中文(交流/注释/文档/代码)
|
||||
- Python: 始终用uv运行(脚本/临时命令uv run python -c); 禁用主机python/禁主机安装包
|
||||
- Python: 当前项目始终用uv运行(脚本/临时命令uv run python -c); 禁用主机python/禁主机安装包
|
||||
- 依赖: 使用 uv run --with 按需加载(已移除 pyproject.toml),依赖列表详见 SKILL.md 与 README.md
|
||||
- 主机环境: 禁止污染配置,需操作须请求用户
|
||||
- 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符
|
||||
- skill文档: skill/SKILL.md,每次迭代按需更新skill文档
|
||||
- skill文档: SKILL.md,每次迭代按需更新skill文档
|
||||
- 测试: 所有需求必须设计全面测试
|
||||
- 任务: 禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
|
||||
- 任务: 除非用户直接要求,禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
|
||||
- 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文
|
||||
- 项目阶段: 未上线,无用户,破坏性变更无需迁移说明
|
||||
- Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明
|
||||
@@ -17,9 +17,9 @@ context: |
|
||||
- 目标:统一文档解析工具,将DOCX/XLSX/PPTX/PDF/HTML/URL 转换为 Markdown,面向AI skill使用
|
||||
# 项目目录结构
|
||||
- scripts/: 核心代码目录
|
||||
- skill/: skill文档目录
|
||||
- tests/: 测试目录
|
||||
- openspec/: 规范文档目录
|
||||
- temp/: 开发临时文件目录
|
||||
- pyproject.toml: 项目配置
|
||||
- README.md: 项目开发文档
|
||||
- SKILL.md: skill文档
|
||||
|
||||
61
openspec/specs/multi-platform-dependencies/spec.md
Normal file
61
openspec/specs/multi-platform-dependencies/spec.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# 多平台依赖管理
|
||||
|
||||
## Purpose
|
||||
|
||||
为不同平台提供特定的依赖配置,解决平台特定的依赖兼容性问题(如 macOS x86_64 的 docling-parse 版本限制)。通过 `uv run --with` 方式按需加载依赖,在文档中提供平台特定的命令示例。
|
||||
|
||||
## Requirements
|
||||
|
||||
### Requirement: 平台检测文档
|
||||
系统必须在 SKILL.md 中提供平台检测方法和平台特定的 `uv run --with` 命令示例。
|
||||
|
||||
#### Scenario: 平台检测命令
|
||||
- **WHEN** 用户阅读 SKILL.md 中的多平台依赖安装指南
|
||||
- **THEN** 系统必须提供以下平台的检测命令:
|
||||
- macOS / Linux: `uname -m` 和 `uname -s`
|
||||
- Windows: PowerShell 环境变量检测
|
||||
- Python 跨平台检测: `import platform; print(f'{platform.system()}-{platform.machine()}')`
|
||||
|
||||
#### Scenario: macOS x86_64 特殊说明
|
||||
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析依赖的安装说明
|
||||
- **THEN** 系统必须明确说明以下特殊要求:
|
||||
- 必须使用 Python 3.12
|
||||
- `docling-parse` 5.x 无 x86_64 wheel,必须使用 4.0.0
|
||||
- 提供完整的 `uv run --python 3.12 --with "docling==2.40.0" --with "docling-parse==4.0.0" --with "numpy<2" ...` 命令示例
|
||||
|
||||
#### Scenario: 每个平台的运行命令
|
||||
- **WHEN** 用户阅读 SKILL.md
|
||||
- **THEN** 系统必须为每个平台(Windows/macOS Intel/macOS ARM/Linux)和每种文档格式提供清晰的 `uv run --with` 命令示例
|
||||
- **AND** 命令必须包含所有必需的依赖包
|
||||
|
||||
### Requirement: 平台检测文档
|
||||
系统必须在 `SKILL.md` 中提供平台检测方法和平台特定的安装指南。
|
||||
|
||||
#### Scenario: 平台检测命令
|
||||
- **WHEN** 用户阅读 `SKILL.md` 中的多平台依赖安装指南
|
||||
- **THEN** 系统必须提供以下平台的检测命令:
|
||||
- macOS / Linux: `uname -m` 和 `uname -s`
|
||||
- Windows: PowerShell 环境变量检测
|
||||
- Python 跨平台检测: `import platform; print(f'{platform.system()}-{platform.machine()}')`
|
||||
|
||||
#### Scenario: macOS x86_64 特殊说明
|
||||
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析依赖的安装说明
|
||||
- **THEN** 系统必须明确说明以下特殊要求:
|
||||
- 必须使用 Python 3.12
|
||||
- `docling-parse` 5.x 无 x86_64 wheel,必须使用 4.0.0
|
||||
|
||||
#### Scenario: 每个平台的安装命令
|
||||
- **WHEN** 用户阅读 `SKILL.md`
|
||||
- **THEN** 系统必须为每个平台(Windows/macOS Intel/macOS ARM/Linux)提供清晰的 `uv run` 命令示例
|
||||
|
||||
### Requirement: Lock 文件管理
|
||||
系统必须移除 `uv.lock` 文件,每次 `uv run` 都是全新的依赖解析。
|
||||
|
||||
#### Scenario: 移除 uv.lock 文件
|
||||
- **WHEN** 用户查看项目根目录
|
||||
- **THEN** 系统必须不包含 uv.lock 文件
|
||||
- **AND** 依赖版本由文档中的版本约束说明
|
||||
|
||||
#### Scenario: gitignore 配置(可选)
|
||||
- **WHEN** 用户查看项目的 `.gitignore` 文件
|
||||
- **THEN** 系统可以包含 `uv.lock` 条目以确保不会误提交(如果用户重新创建了 lock 文件)
|
||||
@@ -52,3 +52,50 @@
|
||||
#### Scenario: 显示构建信息
|
||||
- **WHEN** 构建成功完成
|
||||
- **THEN** 控制台输出版本号和构建文件清单
|
||||
|
||||
### Requirement: --obfuscate 参数支持
|
||||
系统 SHALL 支持 `--obfuscate` 命令行参数,用于启用代码混淆功能。
|
||||
|
||||
#### Scenario: 使用 --obfuscate 参数
|
||||
- **WHEN** 用户执行 `uv run --with pyarmor python build.py --obfuscate`
|
||||
- **THEN** 系统使用 PyArmor 对 scripts 目录代码进行混淆
|
||||
|
||||
#### Scenario: 不使用 --obfuscate 参数
|
||||
- **WHEN** 用户执行 `uv run python build.py`(不带 --obfuscate)
|
||||
- **THEN** 系统执行原有的复制行为,不进行混淆
|
||||
|
||||
### Requirement: PyArmor 混淆执行
|
||||
系统 SHALL 在 `--obfuscate` 模式下调用 PyArmor 工具对 scripts 目录进行混淆。
|
||||
|
||||
#### Scenario: PyArmor 成功执行
|
||||
- **WHEN** 启用 --obfuscate 且 PyArmor 可用
|
||||
- **THEN** 系统执行 pyarmor gen --recursive 命令
|
||||
|
||||
#### Scenario: 混淆后文件输出
|
||||
- **WHEN** PyArmor 混淆完成
|
||||
- **THEN** build/scripts/ 目录包含混淆后的文件
|
||||
|
||||
#### Scenario: pyarmor_runtime 包含
|
||||
- **WHEN** PyArmor 混淆完成
|
||||
- **THEN** build/scripts/ 目录包含 pyarmor_runtime_xxxxxx 子目录
|
||||
|
||||
### Requirement: PyArmor 未安装友好提示
|
||||
系统 SHALL 在 PyArmor 未安装时提供清晰的错误提示,引导用户正确使用 `uv run --with pyarmor`。
|
||||
|
||||
#### Scenario: PyArmor ImportError
|
||||
- **WHEN** 启用 --obfuscate 但未通过 --with pyarmor 加载
|
||||
- **THEN** 系统显示友好错误信息,提示正确命令
|
||||
|
||||
### Requirement: SKILL.md 保持明文
|
||||
系统 SHALL 在混淆模式下仍然将 SKILL.md 作为明文文件复制,不进行混淆。
|
||||
|
||||
#### Scenario: SKILL.md 保持明文
|
||||
- **WHEN** 启用 --obfuscate 执行构建
|
||||
- **THEN** build/SKILL.md 文件为明文,内容与原文件一致
|
||||
|
||||
### Requirement: 混淆错误处理
|
||||
系统 SHALL 在 PyArmor 混淆失败时捕获错误并显示详细信息。
|
||||
|
||||
#### Scenario: PyArmor 命令失败
|
||||
- **WHEN** pyarmor 命令执行返回非零退出码
|
||||
- **THEN** 系统显示退出码、标准输出和错误输出信息
|
||||
|
||||
77
openspec/specs/uv-with-dependency-management/spec.md
Normal file
77
openspec/specs/uv-with-dependency-management/spec.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# UV --with 依赖管理
|
||||
|
||||
## Purpose
|
||||
|
||||
基于文档的依赖管理方式,使用 `uv run --with` 按需加载依赖。移除 pyproject.toml 和 uv.lock,通过 SKILL.md 和 README.md 提供完整的依赖说明和命令示例。
|
||||
|
||||
## Requirements
|
||||
|
||||
### Requirement: 文档驱动的依赖声明
|
||||
系统必须在 SKILL.md 和 README.md 中明确说明每种文档格式和平台所需的依赖包。
|
||||
|
||||
#### Scenario: SKILL.md 包含完整的依赖命令
|
||||
- **WHEN** AI 或用户阅读 SKILL.md
|
||||
- **THEN** 文档必须为每种文档格式(DOCX/XLSX/PPTX/PDF/HTML)和平台提供完整的 `uv run --with` 命令示例
|
||||
- **AND** 命令必须包含所有必需的依赖包
|
||||
|
||||
#### Scenario: README.md 包含开发依赖速查表
|
||||
- **WHEN** 开发者阅读 README.md
|
||||
- **THEN** 文档必须提供测试每种格式的 `uv run --with` 命令示例
|
||||
- **AND** 必须包含特殊平台的版本约束说明(如 macOS Intel)
|
||||
|
||||
### Requirement: 按需加载依赖
|
||||
系统必须使用 `uv run --with` 方式按需加载依赖,无需预先安装 extras 组合。
|
||||
|
||||
#### Scenario: 运行 PDF 解析
|
||||
- **WHEN** 用户执行 `uv run --with docling --with pypdf --with chardet scripts/lyxy_document_reader.py file.pdf`
|
||||
- **THEN** 系统必须自动安装这些依赖(如果尚未安装)
|
||||
- **AND** 必须成功执行脚本
|
||||
|
||||
#### Scenario: 测试 DOCX reader
|
||||
- **WHEN** 开发者执行 `uv run --with docling --with python-docx ... pytest tests/test_readers/test_docx/`
|
||||
- **THEN** 系统必须只安装指定的依赖
|
||||
- **AND** 必须成功运行测试
|
||||
|
||||
### Requirement: 平台特定版本约束
|
||||
系统必须在文档和命令中明确说明特殊平台的版本约束。
|
||||
|
||||
#### Scenario: macOS Intel 的 PDF 解析
|
||||
- **WHEN** 用户在 macOS x86_64 平台阅读 PDF 解析说明
|
||||
- **THEN** 文档必须明确说明需要 Python 3.12
|
||||
- **AND** 命令必须包含版本约束:`--with "docling==2.40.0" --with "docling-parse==4.0.0" --with "numpy<2"`
|
||||
- **AND** 必须说明原因:docling-parse 5.x 无 x86_64 wheel
|
||||
|
||||
#### Scenario: 其他平台使用最新版本
|
||||
- **WHEN** 用户在 macOS ARM 或 Linux 平台
|
||||
- **THEN** 命令可以省略版本号,使用最新兼容版本
|
||||
- **AND** 文档必须说明这是可行的
|
||||
|
||||
### Requirement: 移除 pyproject.toml
|
||||
系统必须移除 pyproject.toml 文件,不再使用 extras 声明依赖。
|
||||
|
||||
#### Scenario: 项目根目录不包含 pyproject.toml
|
||||
- **WHEN** 用户查看项目根目录
|
||||
- **THEN** 系统必须不包含 pyproject.toml 文件
|
||||
|
||||
#### Scenario: 依赖说明不在 pyproject.toml
|
||||
- **WHEN** 用户尝试查找依赖声明
|
||||
- **THEN** 系统必须引导用户查阅 SKILL.md 或 README.md
|
||||
|
||||
### Requirement: 移除 uv.lock
|
||||
系统必须移除 uv.lock 文件,每次 `uv run` 都是全新的依赖解析。
|
||||
|
||||
#### Scenario: 项目不包含 uv.lock
|
||||
- **WHEN** 用户查看项目根目录
|
||||
- **THEN** 系统必须不包含 uv.lock 文件
|
||||
|
||||
#### Scenario: 依赖版本由文档说明
|
||||
- **WHEN** 用户需要了解依赖版本约束
|
||||
- **THEN** 系统必须在 SKILL.md 或 README.md 中说明
|
||||
- **AND** 不依赖 uv.lock 锁定版本
|
||||
|
||||
### Requirement: 核心 chardet 依赖
|
||||
系统必须在所有 `uv run --with` 命令中包含 chardet 依赖。
|
||||
|
||||
#### Scenario: 所有格式都包含 chardet
|
||||
- **WHEN** 用户查阅任何格式的依赖命令
|
||||
- **THEN** 命令必须包含 `--with chardet`
|
||||
@@ -1,67 +0,0 @@
|
||||
[project]
|
||||
name = "lyxy-document"
|
||||
version = "0.1.0"
|
||||
description = "帮助AI工具读取转换文档到markdown的skill"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"chardet>=5.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
docx = [
|
||||
"docling>=2.0.0",
|
||||
"unstructured>=0.12.0",
|
||||
"markitdown>=0.1.0",
|
||||
"pypandoc-binary>=1.13.0",
|
||||
"python-docx>=1.1.0",
|
||||
"markdownify>=0.12.0",
|
||||
]
|
||||
xlsx = [
|
||||
"docling>=2.0.0",
|
||||
"unstructured>=0.12.0",
|
||||
"markitdown>=0.1.0",
|
||||
"pandas>=2.0.0",
|
||||
"tabulate>=0.9.0",
|
||||
]
|
||||
pptx = [
|
||||
"docling>=2.0.0",
|
||||
"unstructured>=0.12.0",
|
||||
"markitdown>=0.1.0",
|
||||
"python-pptx>=0.6.0",
|
||||
"markdownify>=0.12.0",
|
||||
]
|
||||
pdf = [
|
||||
"docling>=2.0.0",
|
||||
"unstructured>=0.12.0",
|
||||
"unstructured-paddleocr>=0.1.0",
|
||||
"markitdown>=0.1.0",
|
||||
"pypdf>=4.0.0",
|
||||
"markdownify>=0.12.0",
|
||||
]
|
||||
html = [
|
||||
"trafilatura>=1.10.0",
|
||||
"domscribe>=0.1.0",
|
||||
"markitdown>=0.1.0",
|
||||
"html2text>=2024.2.26",
|
||||
"beautifulsoup4>=4.12.0",
|
||||
]
|
||||
http = [
|
||||
"httpx>=0.27.0",
|
||||
"pyppeteer>=2.0.0",
|
||||
"selenium>=4.18.0",
|
||||
]
|
||||
office = [
|
||||
"lyxy-document[docx,xlsx,pptx,pdf]",
|
||||
]
|
||||
web = [
|
||||
"lyxy-document[html,http]",
|
||||
]
|
||||
full = [
|
||||
"lyxy-document[office,web]",
|
||||
]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"pytest-cov>=4.1.0",
|
||||
"reportlab>=4.0.0",
|
||||
]
|
||||
@@ -4,12 +4,12 @@ import argparse
|
||||
import sys
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.core.exceptions import FileDetectionError, ReaderNotFoundError
|
||||
from scripts.core.markdown import (
|
||||
from core.exceptions import FileDetectionError, ReaderNotFoundError
|
||||
from core.markdown import (
|
||||
normalize_markdown_whitespace,
|
||||
remove_markdown_images,
|
||||
)
|
||||
from scripts.readers import BaseReader
|
||||
from readers import BaseReader
|
||||
|
||||
|
||||
def parse_input(
|
||||
|
||||
@@ -6,6 +6,12 @@ import logging
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
# 将 scripts/ 目录添加到 sys.path,支持从任意位置执行脚本
|
||||
scripts_dir = Path(__file__).resolve().parent
|
||||
if str(scripts_dir) not in sys.path:
|
||||
sys.path.append(str(scripts_dir))
|
||||
|
||||
# 抑制第三方库的进度条和日志,仅保留解析结果输出
|
||||
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
||||
@@ -20,14 +26,14 @@ logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')
|
||||
logging.getLogger('docling').setLevel(logging.ERROR)
|
||||
logging.getLogger('unstructured').setLevel(logging.ERROR)
|
||||
|
||||
from scripts.core import (
|
||||
from core import (
|
||||
FileDetectionError,
|
||||
ReaderNotFoundError,
|
||||
output_result,
|
||||
parse_input,
|
||||
process_content,
|
||||
)
|
||||
from scripts.readers import READERS
|
||||
from readers import READERS
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers.base import BaseReader
|
||||
from scripts.utils import is_valid_docx
|
||||
from readers.base import BaseReader
|
||||
from utils import is_valid_docx
|
||||
|
||||
from . import docling
|
||||
from . import unstructured
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_docling
|
||||
from readers._utils import parse_via_docling
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_markitdown
|
||||
from readers._utils import parse_via_markitdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import build_markdown_table, safe_open_zip
|
||||
from readers._utils import build_markdown_table, safe_open_zip
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import build_markdown_table
|
||||
from readers._utils import build_markdown_table
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import convert_unstructured_to_markdown
|
||||
from readers._utils import convert_unstructured_to_markdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -4,9 +4,9 @@ import os
|
||||
import tempfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers.base import BaseReader
|
||||
from scripts.utils import is_url
|
||||
from scripts.utils import encoding_detection
|
||||
from readers.base import BaseReader
|
||||
from utils import is_url
|
||||
from utils import encoding_detection
|
||||
|
||||
from . import cleaner
|
||||
from .downloader import download_html
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers.base import BaseReader
|
||||
from scripts.utils import is_valid_pdf
|
||||
from readers.base import BaseReader
|
||||
from utils import is_valid_pdf
|
||||
|
||||
from . import docling_ocr
|
||||
from . import unstructured_ocr
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_markitdown
|
||||
from readers._utils import parse_via_markitdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import convert_unstructured_to_markdown
|
||||
from readers._utils import convert_unstructured_to_markdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import convert_unstructured_to_markdown
|
||||
from readers._utils import convert_unstructured_to_markdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers.base import BaseReader
|
||||
from scripts.utils import is_valid_pptx
|
||||
from readers.base import BaseReader
|
||||
from utils import is_valid_pptx
|
||||
|
||||
from . import docling
|
||||
from . import unstructured
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_docling
|
||||
from readers._utils import parse_via_docling
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_markitdown
|
||||
from readers._utils import parse_via_markitdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import build_markdown_table, flush_list_stack
|
||||
from readers._utils import build_markdown_table, flush_list_stack
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import build_markdown_table, flush_list_stack
|
||||
from readers._utils import build_markdown_table, flush_list_stack
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import convert_unstructured_to_markdown
|
||||
from readers._utils import convert_unstructured_to_markdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers.base import BaseReader
|
||||
from scripts.utils import is_valid_xlsx
|
||||
from readers.base import BaseReader
|
||||
from utils import is_valid_xlsx
|
||||
|
||||
from . import docling
|
||||
from . import unstructured
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_docling
|
||||
from readers._utils import parse_via_docling
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import parse_via_markitdown
|
||||
from readers._utils import parse_via_markitdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import build_markdown_table, safe_open_zip
|
||||
from readers._utils import build_markdown_table, safe_open_zip
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.readers._utils import convert_unstructured_to_markdown
|
||||
from readers._utils import convert_unstructured_to_markdown
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from scripts.config import Config
|
||||
from config import Config
|
||||
|
||||
|
||||
def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -1 +1,12 @@
|
||||
"""Tests package for lyxy-document."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 将 scripts/ 目录添加到 sys.path
|
||||
project_root = Path(__file__).resolve().parent.parent
|
||||
scripts_dir = project_root / "scripts"
|
||||
if str(scripts_dir) not in sys.path:
|
||||
sys.path.insert(0, str(scripts_dir))
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,16 @@
|
||||
"""测试配置和共享 fixtures。"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 将 scripts/ 目录添加到 sys.path(必须在最顶部,在其他导入之前)
|
||||
project_root = Path(__file__).resolve().parent.parent # tests/ 的父目录是项目根目录
|
||||
scripts_dir = project_root / "scripts"
|
||||
if str(scripts_dir) not in sys.path:
|
||||
sys.path.insert(0, str(scripts_dir))
|
||||
|
||||
import pytest
|
||||
from scripts.readers import READERS
|
||||
from readers import READERS
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from io import StringIO
|
||||
from contextlib import redirect_stdout, redirect_stderr
|
||||
|
||||
@@ -22,7 +23,13 @@ def cli_runner():
|
||||
Returns:
|
||||
tuple: (stdout, stderr, exit_code)
|
||||
"""
|
||||
from scripts.lyxy_document_reader import main
|
||||
# 将 scripts/ 目录添加到 sys.path
|
||||
project_root = Path(__file__).resolve().parent.parent.parent # tests/test_cli/ 的父目录是 tests/,再父目录是项目根目录
|
||||
scripts_dir = project_root / "scripts"
|
||||
if str(scripts_dir) not in sys.path:
|
||||
sys.path.insert(0, str(scripts_dir))
|
||||
|
||||
from lyxy_document_reader import main
|
||||
|
||||
# 保存原始 sys.argv 和 sys.exit
|
||||
original_argv = sys.argv
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""测试 Markdown 工具函数。"""
|
||||
|
||||
from scripts.core import (
|
||||
from core import (
|
||||
get_heading_level,
|
||||
extract_titles,
|
||||
normalize_markdown_whitespace,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试所有 DOCX Readers 的一致性。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.docx import (
|
||||
from readers.docx import (
|
||||
docling,
|
||||
unstructured,
|
||||
pypandoc,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import docling
|
||||
from readers.docx import docling
|
||||
|
||||
|
||||
class TestDoclingDocxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import markitdown
|
||||
from readers.docx import markitdown
|
||||
|
||||
|
||||
class TestMarkitdownDocxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import native_xml
|
||||
from readers.docx import native_xml
|
||||
|
||||
|
||||
class TestNativeXmlDocxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import pypandoc
|
||||
from readers.docx import pypandoc
|
||||
|
||||
|
||||
class TestPypandocDocxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import DocxReader
|
||||
from readers.docx import DocxReader
|
||||
|
||||
|
||||
class TestPythonDocxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.docx import unstructured
|
||||
from readers.docx import unstructured
|
||||
|
||||
|
||||
class TestUnstructuredDocxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试所有 HTML Readers 的一致性。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.html import (
|
||||
from readers.html import (
|
||||
html2text,
|
||||
markitdown,
|
||||
trafilatura,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Domscribe HTML Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.html import domscribe
|
||||
from readers.html import domscribe
|
||||
|
||||
|
||||
class TestDomscribeHtmlReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.html import HtmlReader
|
||||
from readers.html import HtmlReader
|
||||
|
||||
|
||||
class TestHtml2TextReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 MarkItDown HTML Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.html import markitdown
|
||||
from readers.html import markitdown
|
||||
|
||||
|
||||
class TestMarkitdownHtmlReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Trafilatura HTML Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.html import trafilatura
|
||||
from readers.html import trafilatura
|
||||
|
||||
|
||||
class TestTrafilaturaHtmlReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试所有 PDF Readers 的一致性。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import (
|
||||
from readers.pdf import (
|
||||
docling,
|
||||
docling_ocr,
|
||||
markitdown,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Docling OCR PDF Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import docling_ocr
|
||||
from readers.pdf import docling_ocr
|
||||
|
||||
|
||||
class TestDoclingOcrPdfReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Docling PDF Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import docling
|
||||
from readers.pdf import docling
|
||||
|
||||
|
||||
class TestDoclingPdfReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 MarkItDown PDF Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import markitdown
|
||||
from readers.pdf import markitdown
|
||||
|
||||
|
||||
class TestMarkitdownPdfReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.pdf import PdfReader
|
||||
from readers.pdf import PdfReader
|
||||
|
||||
|
||||
class TestPypdfReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Unstructured OCR PDF Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import unstructured_ocr
|
||||
from readers.pdf import unstructured_ocr
|
||||
|
||||
|
||||
class TestUnstructuredOcrPdfReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Unstructured PDF Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pdf import unstructured
|
||||
from readers.pdf import unstructured
|
||||
|
||||
|
||||
class TestUnstructuredPdfReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试所有 PPTX Readers 的一致性。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pptx import (
|
||||
from readers.pptx import (
|
||||
docling,
|
||||
markitdown,
|
||||
native_xml,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Docling PPTX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pptx import docling
|
||||
from readers.pptx import docling
|
||||
|
||||
|
||||
class TestDoclingPptxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 MarkItDown PPTX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pptx import markitdown
|
||||
from readers.pptx import markitdown
|
||||
|
||||
|
||||
class TestMarkitdownPptxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Native XML PPTX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pptx import native_xml
|
||||
from readers.pptx import native_xml
|
||||
|
||||
|
||||
class TestNativeXmlPptxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.pptx import PptxReader
|
||||
from readers.pptx import PptxReader
|
||||
|
||||
|
||||
class TestPythonPptxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Unstructured PPTX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.pptx import unstructured
|
||||
from readers.pptx import unstructured
|
||||
|
||||
|
||||
class TestUnstructuredPptxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import zipfile
|
||||
import pytest
|
||||
from scripts.readers._utils import (
|
||||
from readers._utils import (
|
||||
parse_via_markitdown,
|
||||
parse_via_docling,
|
||||
build_markdown_table,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试所有 XLSX Readers 的一致性。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.xlsx import (
|
||||
from readers.xlsx import (
|
||||
docling,
|
||||
markitdown,
|
||||
native_xml,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Docling XLSX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.xlsx import docling
|
||||
from readers.xlsx import docling
|
||||
|
||||
|
||||
class TestDoclingXlsxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 MarkItDown XLSX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.xlsx import markitdown
|
||||
from readers.xlsx import markitdown
|
||||
|
||||
|
||||
class TestMarkitdownXlsxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Native XML XLSX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.xlsx import native_xml
|
||||
from readers.xlsx import native_xml
|
||||
|
||||
|
||||
class TestNativeXmlXlsxReaderParse:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from scripts.readers.xlsx import XlsxReader
|
||||
from readers.xlsx import XlsxReader
|
||||
|
||||
|
||||
class TestPandasXlsxReaderParse:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""测试 Unstructured XLSX Reader 的解析功能。"""
|
||||
|
||||
import pytest
|
||||
from scripts.readers.xlsx import unstructured
|
||||
from readers.xlsx import unstructured
|
||||
|
||||
|
||||
class TestUnstructuredXlsxReaderParse:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""测试文件检测工具函数。"""
|
||||
|
||||
from scripts.utils import is_url, is_html_file
|
||||
from utils import is_url, is_html_file
|
||||
|
||||
|
||||
class TestIsUrl:
|
||||
|
||||
Reference in New Issue
Block a user