Compare commits
2 Commits
25d748aa17
...
e53e64d386
| Author | SHA1 | Date | |
|---|---|---|---|
| e53e64d386 | |||
| 688933c228 |
42
README.md
42
README.md
@@ -70,8 +70,34 @@ DEPENDENCIES = {
|
|||||||
|
|
||||||
`--advice` 参数根据文件扩展名识别类型,检测当前平台,从 `config.DEPENDENCIES` 读取对应配置,生成 `uv run --with` 和 `pip install` 命令。
|
`--advice` 参数根据文件扩展名识别类型,检测当前平台,从 `config.DEPENDENCIES` 读取对应配置,生成 `uv run --with` 和 `pip install` 命令。
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 验证环境
|
||||||
|
|
||||||
|
首先验证项目可以正常运行:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 测试 --advice 功能(无需额外依赖)
|
||||||
|
uv run python scripts/lyxy_document_reader.py test.pdf --advice
|
||||||
|
```
|
||||||
|
|
||||||
|
### 运行基础测试
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 运行 CLI 测试(验证项目基本功能)
|
||||||
|
uv run \
|
||||||
|
--with pytest \
|
||||||
|
pytest tests/test_cli/test_main.py::TestCLIAdviceOption -v
|
||||||
|
```
|
||||||
|
|
||||||
## 开发指南
|
## 开发指南
|
||||||
|
|
||||||
|
### 测试前置依赖说明
|
||||||
|
|
||||||
|
虽然 `HtmlReader` 模块在导入时会加载 `cleaner.py`,但 `cleaner.py` 中的第三方库已改为动态导入,因此运行测试时无需额外依赖。
|
||||||
|
|
||||||
|
`beautifulsoup4` 和 `chardet` 仅在实际使用 HTML 功能时才需要,模块导入时不依赖。
|
||||||
|
|
||||||
### 如何添加新的 Reader
|
### 如何添加新的 Reader
|
||||||
|
|
||||||
1. 在 `scripts/readers/` 下创建新目录
|
1. 在 `scripts/readers/` 下创建新目录
|
||||||
@@ -88,7 +114,6 @@ DEPENDENCIES = {
|
|||||||
uv run \
|
uv run \
|
||||||
--with pytest \
|
--with pytest \
|
||||||
--with pytest-cov \
|
--with pytest-cov \
|
||||||
--with chardet \
|
|
||||||
pytest
|
pytest
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -102,7 +127,6 @@ uv run \
|
|||||||
--with pypandoc-binary \
|
--with pypandoc-binary \
|
||||||
--with python-docx \
|
--with python-docx \
|
||||||
--with markdownify \
|
--with markdownify \
|
||||||
--with chardet \
|
|
||||||
pytest tests/test_readers/test_docx/
|
pytest tests/test_readers/test_docx/
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -115,7 +139,6 @@ uv run \
|
|||||||
--with "markitdown[xlsx]" \
|
--with "markitdown[xlsx]" \
|
||||||
--with pandas \
|
--with pandas \
|
||||||
--with tabulate \
|
--with tabulate \
|
||||||
--with chardet \
|
|
||||||
pytest tests/test_readers/test_xlsx/
|
pytest tests/test_readers/test_xlsx/
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -128,7 +151,6 @@ uv run \
|
|||||||
--with "markitdown[pptx]" \
|
--with "markitdown[pptx]" \
|
||||||
--with python-pptx \
|
--with python-pptx \
|
||||||
--with markdownify \
|
--with markdownify \
|
||||||
--with chardet \
|
|
||||||
pytest tests/test_readers/test_pptx/
|
pytest tests/test_readers/test_pptx/
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -142,7 +164,6 @@ uv run \
|
|||||||
--with "markitdown[pdf]" \
|
--with "markitdown[pdf]" \
|
||||||
--with pypdf \
|
--with pypdf \
|
||||||
--with markdownify \
|
--with markdownify \
|
||||||
--with chardet \
|
|
||||||
--with reportlab \
|
--with reportlab \
|
||||||
pytest tests/test_readers/test_pdf/
|
pytest tests/test_readers/test_pdf/
|
||||||
|
|
||||||
@@ -156,7 +177,6 @@ uv run \
|
|||||||
--with "markitdown[pdf]" \
|
--with "markitdown[pdf]" \
|
||||||
--with pypdf \
|
--with pypdf \
|
||||||
--with markdownify \
|
--with markdownify \
|
||||||
--with chardet \
|
|
||||||
--with reportlab \
|
--with reportlab \
|
||||||
pytest tests/test_readers/test_pdf/
|
pytest tests/test_readers/test_pdf/
|
||||||
```
|
```
|
||||||
@@ -177,17 +197,20 @@ uv run \
|
|||||||
|
|
||||||
#### 运行特定测试文件或方法
|
#### 运行特定测试文件或方法
|
||||||
```bash
|
```bash
|
||||||
# 运行特定测试文件
|
# 运行特定测试文件(CLI 测试无需额外依赖)
|
||||||
uv run \
|
uv run \
|
||||||
--with pytest \
|
--with pytest \
|
||||||
--with chardet \
|
|
||||||
pytest tests/test_cli/test_main.py
|
pytest tests/test_cli/test_main.py
|
||||||
|
|
||||||
|
# 仅运行 --advice 相关测试(不需要额外依赖)
|
||||||
|
uv run \
|
||||||
|
--with pytest \
|
||||||
|
pytest tests/test_cli/test_main.py::TestCLIAdviceOption
|
||||||
|
|
||||||
# 运行特定测试类或方法
|
# 运行特定测试类或方法
|
||||||
uv run \
|
uv run \
|
||||||
--with pytest \
|
--with pytest \
|
||||||
--with docling \
|
--with docling \
|
||||||
--with chardet \
|
|
||||||
pytest tests/test_cli/test_main.py::TestCLIDefaultOutput::test_default_output_docx
|
pytest tests/test_cli/test_main.py::TestCLIDefaultOutput::test_default_output_docx
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -196,7 +219,6 @@ uv run \
|
|||||||
uv run \
|
uv run \
|
||||||
--with pytest \
|
--with pytest \
|
||||||
--with pytest-cov \
|
--with pytest-cov \
|
||||||
--with chardet \
|
|
||||||
pytest --cov=scripts --cov-report=term-missing
|
pytest --cov=scripts --cov-report=term-missing
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -30,8 +30,7 @@ DEPENDENCIES = {
|
|||||||
"unstructured[pdf]",
|
"unstructured[pdf]",
|
||||||
"markitdown[pdf]",
|
"markitdown[pdf]",
|
||||||
"pypdf",
|
"pypdf",
|
||||||
"markdownify",
|
"markdownify"
|
||||||
"chardet"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"Darwin-x86_64": {
|
"Darwin-x86_64": {
|
||||||
@@ -42,8 +41,7 @@ DEPENDENCIES = {
|
|||||||
"numpy<2",
|
"numpy<2",
|
||||||
"markitdown[pdf]",
|
"markitdown[pdf]",
|
||||||
"pypdf",
|
"pypdf",
|
||||||
"markdownify",
|
"markdownify"
|
||||||
"chardet"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -56,8 +54,7 @@ DEPENDENCIES = {
|
|||||||
"markitdown[docx]",
|
"markitdown[docx]",
|
||||||
"pypandoc-binary",
|
"pypandoc-binary",
|
||||||
"python-docx",
|
"python-docx",
|
||||||
"markdownify",
|
"markdownify"
|
||||||
"chardet"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -69,8 +66,7 @@ DEPENDENCIES = {
|
|||||||
"unstructured[xlsx]",
|
"unstructured[xlsx]",
|
||||||
"markitdown[xlsx]",
|
"markitdown[xlsx]",
|
||||||
"pandas",
|
"pandas",
|
||||||
"tabulate",
|
"tabulate"
|
||||||
"chardet"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -82,8 +78,7 @@ DEPENDENCIES = {
|
|||||||
"unstructured[pptx]",
|
"unstructured[pptx]",
|
||||||
"markitdown[pptx]",
|
"markitdown[pptx]",
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"markdownify",
|
"markdownify"
|
||||||
"chardet"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -3,8 +3,6 @@
|
|||||||
import re
|
import re
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
|
|
||||||
def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
|
def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user