test: 添加全面的测试套件，覆盖所有 Reader 实现

- 测试数量从 83 个增加到 193 个 (+132%)
- 代码覆盖率从 48% 提升到 69% (+44%)
- 为每种文档格式的所有 Reader 实现创建独立测试
- 添加跨 Reader 的一致性验证测试
- 新增 4 个测试规范 (cli-testing, exception-testing, reader-testing, test-fixtures)
- 更新 README 测试统计信息

测试覆盖:
- DOCX: python-docx, markitdown, docling, native-xml, pypandoc, unstructured
- PDF: pypdf, markitdown, docling, docling-ocr, unstructured, unstructured-ocr
- HTML: html2text, markitdown, trafilatura, domscribe
- PPTX: python-pptx, markitdown, docling, native-xml, unstructured
- XLSX: pandas, markitdown, docling, native-xml, unstructured
- CLI: 所有命令行选项和错误处理

所有 193 个测试通过。
2026-03-08 22:20:21 +08:00
parent c35bbc90b5
commit 7eab1dcef1
53 changed files with 3094 additions and 259 deletions
--- a/tests/test_cli/conftest.py
+++ b/tests/test_cli/conftest.py
@@ -0,0 +1,87 @@
+"""CLI 测试专用 fixtures。"""
+
+import pytest
+import sys
+from io import StringIO
+from contextlib import redirect_stdout, redirect_stderr
+
+
+@pytest.fixture
+def cli_runner():
+    """CLI 运行器 fixture，用于调用 main() 函数并捕获输出。
+
+    Returns:
+        function: 接受 args 列表，返回 (stdout, stderr, exit_code) 元组
+    """
+    def _run_cli(args):
+        """运行 CLI 并捕获输出。
+
+        Args:
+            args: 命令行参数列表（不包含程序名）
+
+        Returns:
+            tuple: (stdout, stderr, exit_code)
+        """
+        from scripts.lyxy_document_reader import main
+
+        # 保存原始 sys.argv 和 sys.exit
+        original_argv = sys.argv
+        original_exit = sys.exit
+
+        stdout_capture = StringIO()
+        stderr_capture = StringIO()
+        exit_code = 0
+
+        def mock_exit(code=0):
+            nonlocal exit_code
+            exit_code = code
+            raise SystemExit(code)
+
+        try:
+            # 设置命令行参数
+            sys.argv = ['lyxy_document_reader'] + args
+            sys.exit = mock_exit
+
+            # 捕获输出
+            with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
+                try:
+                    main()
+                except SystemExit:
+                    pass
+
+        finally:
+            # 恢复原始状态
+            sys.argv = original_argv
+            sys.exit = original_exit
+
+        return stdout_capture.getvalue(), stderr_capture.getvalue(), exit_code
+
+    return _run_cli
+
+
+@pytest.fixture
+def temp_test_file(tmp_path, temp_docx, temp_pdf, temp_html, temp_pptx, temp_xlsx):
+    """根据格式类型创建临时测试文件的 fixture 工厂。
+
+    Args:
+        format_type: 文件格式类型 ('docx', 'pdf', 'html', 'pptx', 'xlsx')
+        **kwargs: 传递给对应 fixture 的参数
+
+    Returns:
+        str: 临时文件路径
+    """
+    def _create_file(format_type, **kwargs):
+        if format_type == 'docx':
+            return temp_docx(**kwargs)
+        elif format_type == 'pdf':
+            return temp_pdf(**kwargs)
+        elif format_type == 'html':
+            return temp_html(**kwargs)
+        elif format_type == 'pptx':
+            return temp_pptx(**kwargs)
+        elif format_type == 'xlsx':
+            return temp_xlsx(**kwargs)
+        else:
+            raise ValueError(f"不支持的格式类型: {format_type}")
+
+    return _create_file
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -0,0 +1,201 @@
+"""测试 CLI 主函数功能。"""
+
+import pytest
+import os
+
+
+class TestCLIDefaultOutput:
+    """测试 CLI 默认输出功能。"""
+
+    def test_default_output_docx(self, cli_runner, temp_docx):
+        """测试默认输出 DOCX 文件的 Markdown 内容。"""
+        file_path = temp_docx(paragraphs=["测试内容段落"])
+
+        stdout, stderr, exit_code = cli_runner([file_path])
+
+        assert exit_code == 0
+        assert "测试内容段落" in stdout
+        assert len(stdout.strip()) > 0
+
+    def test_default_output_pdf(self, cli_runner, temp_pdf):
+        """测试默认输出 PDF 文件的 Markdown 内容。"""
+        file_path = temp_pdf(text="PDF测试内容")
+
+        stdout, stderr, exit_code = cli_runner([file_path])
+
+        assert exit_code == 0
+        # PDF 解析可能有格式差异，只验证有输出
+        assert len(stdout.strip()) > 0
+
+    def test_default_output_html(self, cli_runner, temp_html):
+        """测试默认输出 HTML 文件的 Markdown 内容。"""
+        file_path = temp_html(content="<h1>HTML标题</h1><p>HTML内容</p>")
+
+        stdout, stderr, exit_code = cli_runner([file_path])
+
+        assert exit_code == 0
+        assert "HTML标题" in stdout or "HTML内容" in stdout
+
+
+class TestCLICountOption:
+    """测试 CLI 字数统计功能。"""
+
+    def test_count_option(self, cli_runner, temp_docx):
+        """测试 -c 选项统计字数。"""
+        file_path = temp_docx(paragraphs=["测试内容"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-c"])
+
+        assert exit_code == 0
+        # 输出应该是一个数字
+        assert stdout.strip().isdigit()
+        count = int(stdout.strip())
+        assert count > 0
+
+    def test_count_option_long_form(self, cli_runner, temp_docx):
+        """测试 --count 选项。"""
+        file_path = temp_docx(paragraphs=["测试"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "--count"])
+
+        assert exit_code == 0
+        assert stdout.strip().isdigit()
+
+
+class TestCLILinesOption:
+    """测试 CLI 行数统计功能。"""
+
+    def test_lines_option(self, cli_runner, temp_docx):
+        """测试 -l 选项统计行数。"""
+        file_path = temp_docx(paragraphs=["第一行", "第二行", "第三行"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-l"])
+
+        assert exit_code == 0
+        # 输出应该是一个数字
+        assert stdout.strip().isdigit()
+        lines = int(stdout.strip())
+        assert lines > 0
+
+
+class TestCLITitlesOption:
+    """测试 CLI 标题提取功能。"""
+
+    def test_titles_option(self, cli_runner, temp_docx):
+        """测试 -t 选项提取标题。"""
+        file_path = temp_docx(
+            headings=[(1, "一级标题"), (2, "二级标题")],
+            paragraphs=["普通段落"]
+        )
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-t"])
+
+        assert exit_code == 0
+        # 输出应该包含标题
+        assert "一级标题" in stdout
+        assert "二级标题" in stdout
+        # 不应该包含普通段落
+        assert "普通段落" not in stdout
+
+
+class TestCLITitleContentOption:
+    """测试 CLI 标题内容提取功能。"""
+
+    def test_title_content_option(self, cli_runner, temp_docx):
+        """测试 -tc 选项提取标题内容。"""
+        file_path = temp_docx(
+            headings=[(1, "目标标题")],
+            paragraphs=["标题下的内容"]
+        )
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-tc", "目标标题"])
+
+        assert exit_code == 0
+        assert "目标标题" in stdout
+        assert "标题下的内容" in stdout
+
+    def test_title_content_not_found(self, cli_runner, temp_docx):
+        """测试标题不存在时的错误处理。"""
+        file_path = temp_docx(paragraphs=["测试内容"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-tc", "不存在的标题"])
+
+        assert exit_code != 0
+        # 应该输出错误信息
+        output = stdout + stderr
+        assert "未找到" in output or "不存在" in output or "错误" in output
+
+
+class TestCLISearchOption:
+    """测试 CLI 搜索功能。"""
+
+    def test_search_option(self, cli_runner, temp_docx):
+        """测试 -s 选项搜索内容。"""
+        file_path = temp_docx(paragraphs=["包含关键词的段落", "其他内容"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-s", "关键词"])
+
+        assert exit_code == 0
+        assert "关键词" in stdout
+
+    def test_search_no_match(self, cli_runner, temp_docx):
+        """测试搜索无匹配时的错误处理。"""
+        file_path = temp_docx(paragraphs=["测试内容"])
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-s", "不存在的内容"])
+
+        assert exit_code != 0
+        # 应该输出错误信息
+        output = stdout + stderr
+        assert "未找到" in output or "无匹配" in output or "错误" in output
+
+    def test_search_with_context(self, cli_runner, temp_docx):
+        """测试 -n 选项设置上下文行数。"""
+        file_path = temp_docx(
+            paragraphs=["第一行", "第二行", "包含关键词的行", "第四行", "第五行"]
+        )
+
+        stdout, stderr, exit_code = cli_runner([file_path, "-s", "关键词", "-n", "2"])
+
+        assert exit_code == 0
+        assert "关键词" in stdout
+        # 应该包含上下文
+        assert "第二行" in stdout or "第四行" in stdout
+
+
+class TestCLIErrorHandling:
+    """测试 CLI 错误处理。"""
+
+    def test_file_not_exists(self, cli_runner, tmp_path):
+        """测试文件不存在时的错误处理。"""
+        non_existent = str(tmp_path / "non_existent.docx")
+
+        stdout, stderr, exit_code = cli_runner([non_existent])
+
+        assert exit_code != 0
+        output = stdout + stderr
+        assert "错误" in output or "不存在" in output
+
+    def test_unsupported_format(self, cli_runner, tmp_path):
+        """测试不支持的文件类型。"""
+        unsupported_file = tmp_path / "test.xyz"
+        unsupported_file.write_text("test content")
+
+        stdout, stderr, exit_code = cli_runner([str(unsupported_file)])
+
+        assert exit_code != 0
+        output = stdout + stderr
+        assert "reader" in output.lower() or "支持" in output
+
+    def test_all_readers_failed(self, cli_runner, tmp_path):
+        """测试所有 Reader 失败时的错误输出。"""
+        # 创建一个看起来像 DOCX 但实际损坏的文件
+        fake_docx = tmp_path / "fake.docx"
+        fake_docx.write_bytes(b"not a real docx file")
+
+        stdout, stderr, exit_code = cli_runner([str(fake_docx)])
+
+        assert exit_code != 0
+        output = stdout + stderr
+        # 应该列出失败原因
+        assert "失败" in output or "错误" in output