test: 添加全面的测试套件,覆盖所有 Reader 实现

- 测试数量从 83 个增加到 193 个 (约 +133%)
- 代码覆盖率从 48% 提升到 69% (+21 个百分点,相对提升约 44%)
- 为每种文档格式的所有 Reader 实现创建独立测试
- 添加跨 Reader 的一致性验证测试
- 新增 4 个测试规范 (cli-testing, exception-testing, reader-testing, test-fixtures)
- 更新 README 测试统计信息

测试覆盖:
- DOCX: python-docx, markitdown, docling, native-xml, pypandoc, unstructured
- PDF: pypdf, markitdown, docling, docling-ocr, unstructured, unstructured-ocr
- HTML: html2text, markitdown, trafilatura, domscribe
- PPTX: python-pptx, markitdown, docling, native-xml, unstructured
- XLSX: pandas, markitdown, docling, native-xml, unstructured
- CLI: 所有命令行选项和错误处理

所有 193 个测试通过。
This commit is contained in:
2026-03-08 22:20:21 +08:00
parent c35bbc90b5
commit 7eab1dcef1
53 changed files with 3094 additions and 259 deletions

1
scripts/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""lyxy-document 核心模块。"""

View File

@@ -50,13 +50,13 @@ def output_result(
elif args.lines:
print(len(content.split("\n")))
elif args.titles:
from core.markdown import extract_titles
from scripts.core.markdown import extract_titles
titles = extract_titles(content)
for title in titles:
print(title)
elif args.title_content:
from core.markdown import extract_title_content
from scripts.core.markdown import extract_title_content
title_content = extract_title_content(content, args.title_content)
if title_content is None:
@@ -64,7 +64,7 @@ def output_result(
sys.exit(1)
print(title_content, end="")
elif args.search:
from core.markdown import search_markdown
from scripts.core.markdown import search_markdown
search_result = search_markdown(content, args.search, args.context)
if search_result is None:

View File

@@ -32,7 +32,7 @@ class DocxReader(BaseReader):
return [".docx"]
def supports(self, file_path: str) -> bool:
return file_path.endswith('.docx')
return file_path.lower().endswith('.docx')
def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
failures = []

View File

@@ -31,7 +31,7 @@ class HtmlReader(BaseReader):
return [".html", ".htm"]
def supports(self, file_path: str) -> bool:
return is_url(file_path) or file_path.endswith(('.html', '.htm'))
return is_url(file_path) or file_path.lower().endswith(('.html', '.htm'))
def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]:
"""下载 URL 并解析"""
@@ -74,6 +74,10 @@ class HtmlReader(BaseReader):
if is_url(file_path):
return self.download_and_parse(file_path)
# 检查文件是否存在
if not os.path.exists(file_path):
return None, ["文件不存在"]
# 读取本地 HTML 文件,使用编码检测
html_content, error = encoding_detection.read_text_file(file_path)
if error:

View File

@@ -32,7 +32,7 @@ class PdfReader(BaseReader):
return [".pdf"]
def supports(self, file_path: str) -> bool:
return file_path.endswith('.pdf')
return file_path.lower().endswith('.pdf')
def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
failures = []

View File

@@ -30,7 +30,7 @@ class PptxReader(BaseReader):
return [".pptx"]
def supports(self, file_path: str) -> bool:
return file_path.endswith('.pptx')
return file_path.lower().endswith('.pptx')
def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
failures = []

View File

@@ -30,7 +30,7 @@ class XlsxReader(BaseReader):
return [".xlsx"]
def supports(self, file_path: str) -> bool:
return file_path.endswith('.xlsx')
return file_path.lower().endswith('.xlsx')
def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
failures = []