## 功能特性

- 建立统一的项目结构,包含 core/、readers/、utils/、tests/ 模块
- 迁移 lyxy-reader-office 的所有解析器(docx、xlsx、pptx、pdf)
- 迁移 lyxy-reader-html 的所有解析器(html、url 下载)
- 统一 CLI 入口为 lyxy_document_reader.py
- 统一 Markdown 后处理逻辑
- 按文件类型组织 readers,每个解析器独立文件
- 依赖分组按文件类型细分(docx、xlsx、pptx、pdf、html、http)
- PDF OCR 解析器优先,无参数控制
- 使用 logging 模块替代简单 print
- 设计完整的单元测试结构
- 重写项目文档

## 新增目录/文件

- core/ - 核心模块(异常体系、Markdown 工具、解析调度器)
- readers/ - 格式阅读器(base.py + docx/xlsx/pptx/pdf/html)
- utils/ - 工具函数(文件类型检测)
- tests/ - 测试(conftest.py + test_core/ + test_readers/ + test_utils/)
- lyxy_document_reader.py - 统一 CLI 入口

## 依赖分组

- docx - DOCX 文档解析支持
- xlsx - XLSX 文档解析支持
- pptx - PPTX 文档解析支持
- pdf - PDF 文档解析支持(含 OCR)
- html - HTML/URL 解析支持
- http - HTTP/URL 下载支持
- office - Office 格式组合(docx/xlsx/pptx/pdf)
- web - Web 格式组合(html/http)
- full - 完整功能
- dev - 开发依赖
93 lines
1.7 KiB
TOML
93 lines
1.7 KiB
TOML
# Core package metadata (PEP 621).
[project]
name = "lyxy-document"
version = "0.1.0"
description = "帮助AI工具读取转换文档到markdown的skill"
requires-python = ">=3.11"
readme = "README.md"
# The base install declares no required dependencies; every parser backend
# is opt-in through the extras under [project.optional-dependencies].
dependencies = []

# Extras are split by input format. Requirement strings inside each extra are
# kept in alphabetical order.
[project.optional-dependencies]
# DOCX parsing.
docx = [
    "docling>=2.0.0",
    "markdownify>=0.12.0",
    "markitdown>=0.1.0",
    "pypandoc-binary>=1.13.0",
    "python-docx>=1.1.0",
    "unstructured>=0.12.0",
]
# XLSX parsing.
xlsx = [
    "docling>=2.0.0",
    "markitdown>=0.1.0",
    "pandas>=2.0.0",
    "tabulate>=0.9.0",
    "unstructured>=0.12.0",
]
# PPTX parsing.
pptx = [
    "docling>=2.0.0",
    "markdownify>=0.12.0",
    "markitdown>=0.1.0",
    "python-pptx>=0.6.0",
    "unstructured>=0.12.0",
]
# PDF parsing, including OCR support.
pdf = [
    "docling>=2.0.0",
    "markdownify>=0.12.0",
    "markitdown>=0.1.0",
    "pypdf>=4.0.0",
    "unstructured>=0.12.0",
    "unstructured-paddleocr>=0.1.0",
]
# HTML parsing.
html = [
    "beautifulsoup4>=4.12.0",
    "domscribe>=0.1.0",
    "html2text>=2024.2.26",
    "markitdown>=0.1.0",
    "trafilatura>=1.10.0",
]
# HTTP/URL download support.
http = [
    "httpx>=0.27.0",
    "pyppeteer>=2.0.0",
    "selenium>=4.18.0",
]
# Aggregate extras: each one pulls in this project's own extras by name.
office = ["lyxy-document[docx,xlsx,pptx,pdf]"]
web = ["lyxy-document[html,http]"]
full = ["lyxy-document[office,web]"]
# Development tooling (tests, formatting, type checking).
dev = [
    "black>=24.0.0",
    "isort>=5.13.0",
    "mypy>=1.8.0",
    "pytest>=8.0.0",
    "pytest-cov>=4.1.0",
]

# Console entry point: the `lyxy-document-reader` command invokes `main`
# from the `lyxy_document_reader` module.
[project.scripts]
lyxy-document-reader = "lyxy_document_reader:main"

# Build backend declaration (PEP 517/518): the project is built with hatchling.
# NOTE(review): no [tool.hatch.build] configuration is visible in this file.
# The console script targets the `lyxy_document_reader` module, not a
# `lyxy_document` package, so confirm hatchling can locate the files to ship.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Formatter settings: target Python 3.11 syntax with a 100-column limit.
[tool.black]
target-version = ["py311"]
line-length = 100

# Import sorting, configured to agree with the black formatter.
[tool.isort]
profile = "black"
# isort spells this option `line_length` (underscore). The hyphenated form is
# not a recognised isort setting, so the width would fall back to the profile
# default instead of the 100 columns configured under [tool.black].
line_length = 100

# Static type checking against Python 3.11.
[tool.mypy]
python_version = "3.11"
# Require type annotations on every function definition.
disallow_untyped_defs = true
# Warn when a function declared with a concrete return type returns `Any`.
warn_return_any = true
# Warn about mypy config sections that end up unused.
warn_unused_configs = true

# pytest settings.
[tool.pytest.ini_options]
# Put the project root on sys.path for the test session.
pythonpath = ["."]
# Directory searched for tests when pytest is run without path arguments.
testpaths = ["tests"]