移除对旧版 .doc 和 .ppt 格式的支持,以简化项目架构并减少维护负担。

变更内容:
- 删除 scripts/readers/doc/ 目录
- 删除 scripts/readers/ppt/ 目录
- 从 readers/__init__.py 中移除 DocReader 和 PptReader
- 从 utils/file_detection.py 中移除 is_valid_doc 和 is_valid_ppt
- 从 config.py 中移除 doc 和 ppt 依赖配置
- 从 advice_generator.py 中移除相关映射
- 更新 CLI 帮助文档
- 更新 README.md 文档
- 删除相关测试用例
- 删除相关规范文档
244 lines
8.1 KiB
Python
244 lines
8.1 KiB
Python
"""测试 CLI 主函数功能。"""
|
|
|
|
import pytest
|
|
import os
|
|
|
|
|
|
class TestCLIAdviceOption:
    """Tests for the CLI ``-a`` / ``--advice`` option."""

    def test_advice_option_pdf(self, cli_runner):
        """Check ``-a`` with a ``.pdf`` argument: exit code 0, type line and both command sections."""
        out, _err, code = cli_runner(["test.pdf", "-a"])

        assert code == 0
        for expected in ("文件类型: PDF", "[uv 命令]", "[python 命令]"):
            assert expected in out

    def test_advice_option_docx(self, cli_runner):
        """Check ``--advice`` with a ``.docx`` argument reports the DOCX type."""
        argv = ["test.docx", "--advice"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        assert "文件类型: DOCX" in out

    def test_advice_option_url(self, cli_runner):
        """Check ``--advice`` with an https URL reports the HTML type."""
        argv = ["https://example.com", "--advice"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        assert "文件类型: HTML" in out

    def test_advice_option_unknown(self, cli_runner):
        """Check ``--advice`` with an unrecognised extension fails with an error message."""
        out, err, code = cli_runner(["test.xyz", "--advice"])

        assert code != 0
        combined = out + err
        assert any(marker in combined for marker in ("无法识别", "错误"))

    def test_advice_option_xls(self, cli_runner):
        """Check ``--advice`` with a ``.xls`` argument reports the XLS type."""
        argv = ["test.xls", "--advice"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        assert "文件类型: XLS" in out
|
|
|
|
|
|
class TestCLIDefaultOutput:
    """Tests for the CLI default output (no option flags)."""

    def test_default_output_docx(self, cli_runner, temp_docx):
        """Check that a DOCX file's paragraph text appears in the default output."""
        docx_path = temp_docx(paragraphs=["测试内容段落"])

        out, _err, code = cli_runner([docx_path])

        assert code == 0
        assert "测试内容段落" in out
        assert out.strip() != ""

    def test_default_output_pdf(self, cli_runner, temp_pdf):
        """Check that a PDF file produces some default output."""
        pdf_path = temp_pdf(text="PDF测试内容")

        out, _err, code = cli_runner([pdf_path])

        assert code == 0
        # PDF parsing may differ in formatting, so only require non-empty output.
        assert out.strip() != ""

    def test_default_output_html(self, cli_runner, temp_html):
        """Check that an HTML file's heading or paragraph text appears in the default output."""
        html_path = temp_html(content="<h1>HTML标题</h1><p>HTML内容</p>")

        out, _err, code = cli_runner([html_path])

        assert code == 0
        assert any(fragment in out for fragment in ("HTML标题", "HTML内容"))
|
|
|
|
|
|
class TestCLICountOption:
    """Tests for the CLI word-count option (``-c`` / ``--count``)."""

    def test_count_option(self, cli_runner, temp_docx):
        """Check that ``-c`` prints a positive integer."""
        docx_path = temp_docx(paragraphs=["测试内容"])

        out, _err, code = cli_runner([docx_path, "-c"])

        assert code == 0
        # The output is expected to be a bare number.
        printed = out.strip()
        assert printed.isdigit()
        assert int(printed) > 0

    def test_count_option_long_form(self, cli_runner, temp_docx):
        """Check that the long form ``--count`` prints a number."""
        docx_path = temp_docx(paragraphs=["测试"])

        out, _err, code = cli_runner([docx_path, "--count"])

        assert code == 0
        printed = out.strip()
        assert printed.isdigit()
|
|
|
|
|
|
class TestCLILinesOption:
    """Tests for the CLI line-count option (``-l``)."""

    def test_lines_option(self, cli_runner, temp_docx):
        """Check that ``-l`` prints a positive integer."""
        docx_path = temp_docx(paragraphs=["第一行", "第二行", "第三行"])

        out, _err, code = cli_runner([docx_path, "-l"])

        assert code == 0
        # The output is expected to be a bare number.
        printed = out.strip()
        assert printed.isdigit()
        assert int(printed) > 0
|
|
|
|
|
|
class TestCLITitlesOption:
    """Tests for the CLI heading-extraction option (``-t``)."""

    def test_titles_option(self, cli_runner, temp_docx):
        """Check that ``-t`` prints the headings and omits body paragraphs."""
        heading_specs = [(1, "一级标题"), (2, "二级标题")]
        docx_path = temp_docx(headings=heading_specs, paragraphs=["普通段落"])

        out, _err, code = cli_runner([docx_path, "-t"])

        assert code == 0
        # Both headings must be present in the output.
        for heading_text in ("一级标题", "二级标题"):
            assert heading_text in out
        # The ordinary paragraph must not be present.
        assert "普通段落" not in out
|
|
|
|
|
|
class TestCLITitleContentOption:
    """Tests for the CLI heading-content extraction option (``-tc``)."""

    def test_title_content_option(self, cli_runner, temp_docx):
        """Check that ``-tc`` prints the requested heading and the text under it."""
        docx_path = temp_docx(
            headings=[(1, "目标标题")],
            paragraphs=["标题下的内容"],
        )

        argv = [docx_path, "-tc", "目标标题"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        for expected in ("目标标题", "标题下的内容"):
            assert expected in out

    def test_title_content_not_found(self, cli_runner, temp_docx):
        """Check that ``-tc`` with a heading that does not exist fails with an error message."""
        docx_path = temp_docx(paragraphs=["测试内容"])

        argv = [docx_path, "-tc", "不存在的标题"]
        out, err, code = cli_runner(argv)

        assert code != 0
        # An error message is expected on stdout or stderr.
        combined = out + err
        assert any(marker in combined for marker in ("未找到", "不存在", "错误"))
|
|
|
|
|
|
class TestCLISearchOption:
    """Tests for the CLI search option (``-s``) and its context option (``-n``)."""

    def test_search_option(self, cli_runner, temp_docx):
        """Check that ``-s`` prints output containing the searched keyword."""
        docx_path = temp_docx(paragraphs=["包含关键词的段落", "其他内容"])

        argv = [docx_path, "-s", "关键词"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        assert "关键词" in out

    def test_search_no_match(self, cli_runner, temp_docx):
        """Check that a search with no match fails with an error message."""
        docx_path = temp_docx(paragraphs=["测试内容"])

        argv = [docx_path, "-s", "不存在的内容"]
        out, err, code = cli_runner(argv)

        assert code != 0
        # An error message is expected on stdout or stderr.
        combined = out + err
        assert any(marker in combined for marker in ("未找到", "无匹配", "错误"))

    def test_search_with_context(self, cli_runner, temp_docx):
        """Check that ``-n 2`` adds at least one neighbouring line to the search output."""
        body_lines = ["第一行", "第二行", "包含关键词的行", "第四行", "第五行"]
        docx_path = temp_docx(paragraphs=body_lines)

        argv = [docx_path, "-s", "关键词", "-n", "2"]
        out, _err, code = cli_runner(argv)

        assert code == 0
        assert "关键词" in out
        # At least one of the adjacent lines must be included as context.
        assert any(neighbour in out for neighbour in ("第二行", "第四行"))
|
|
|
|
|
|
class TestCLIErrorHandling:
    """Tests for CLI error handling."""

    def test_file_not_exists(self, cli_runner, tmp_path):
        """Check that a path to a missing file fails with an error message."""
        missing_path = str(tmp_path / "non_existent.docx")

        out, err, code = cli_runner([missing_path])

        assert code != 0
        combined = out + err
        assert any(marker in combined for marker in ("错误", "不存在"))

    def test_unsupported_format(self, cli_runner, tmp_path):
        """Check that a file with an unsupported extension fails with a relevant message."""
        xyz_file = tmp_path / "test.xyz"
        xyz_file.write_text("test content")

        out, err, code = cli_runner([str(xyz_file)])

        assert code != 0
        combined = out + err
        mentions_reader = "reader" in combined.lower()
        assert mentions_reader or "支持" in combined

    def test_all_readers_failed(self, cli_runner, tmp_path):
        """Check the error output when the file cannot be read as the format its name suggests."""
        # A file with a .docx name whose bytes are not a real DOCX document.
        broken_docx = tmp_path / "fake.docx"
        broken_docx.write_bytes(b"not a real docx file")

        out, err, code = cli_runner([str(broken_docx)])

        assert code != 0
        combined = out + err
        # The failure reason is expected to be reported.
        assert any(marker in combined for marker in ("失败", "错误"))
|