diff --git a/tests/conftest.py b/tests/conftest.py
index 2bc9ada..5cc792d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -51,10 +51,7 @@ def temp_docx(tmp_path):
         str: 临时文件路径
     """
     def _create_docx(paragraphs=None, headings=None, table_data=None, list_items=None):
-        try:
-            from docx import Document
-        except ImportError:
-            pytest.skip("python-docx 未安装")
+        from docx import Document
 
         doc = Document()
 
@@ -99,13 +96,10 @@ def temp_pdf(tmp_path):
         str: 临时文件路径
     """
     def _create_pdf(text=None, lines=None):
-        try:
-            from reportlab.pdfgen import canvas
-            from reportlab.lib.pagesizes import letter
-            from reportlab.pdfbase import pdfmetrics
-            from reportlab.pdfbase.ttfonts import TTFont
-        except ImportError:
-            pytest.skip("reportlab 未安装")
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfbase import pdfmetrics
+        from reportlab.pdfbase.ttfonts import TTFont
 
         file_path = tmp_path / "test.pdf"
         c = canvas.Canvas(str(file_path), pagesize=letter)
@@ -176,10 +170,7 @@ def temp_pptx(tmp_path):
         str: 临时文件路径
     """
     def _create_pptx(slides=None):
-        try:
-            from pptx import Presentation
-        except ImportError:
-            pytest.skip("python-pptx 未安装")
+        from pptx import Presentation
 
         prs = Presentation()
 
@@ -209,10 +200,7 @@ def temp_xlsx(tmp_path):
         str: 临时文件路径
     """
     def _create_xlsx(data=None):
-        try:
-            import pandas as pd
-        except ImportError:
-            pytest.skip("pandas 未安装")
+        import pandas as pd
 
         file_path = tmp_path / "test.xlsx"
 
diff --git a/tests/test_core/test_markdown_extra.py b/tests/test_core/test_markdown_extra.py
new file mode 100644
index 0000000..d6649ec
--- /dev/null
+++ b/tests/test_core/test_markdown_extra.py
@@ -0,0 +1,233 @@
+"""Tests for the advanced markdown helpers (extract_title_content, search_markdown)."""
+
+import pytest
+
+from core.markdown import extract_title_content, search_markdown
+
+
+class TestExtractTitleContent:
+    """Tests for the extract_title_content function."""
+
+    def test_extract_simple_title(self):
+        """Test extracting a simple title."""
+        markdown = """# 目标标题
+
+这是标题下的内容。
+第二段内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "# 目标标题" in result
+        assert "这是标题下的内容" in result
+
+    def test_extract_with_subtitles(self):
+        """Test extracting content that includes subtitles."""
+        markdown = """# 目标标题
+
+这是标题下的内容。
+
+## 子标题
+
+子标题下的内容。
+
+### 孙子标题
+
+更深层的内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "# 目标标题" in result
+        assert "## 子标题" in result
+        assert "### 孙子标题" in result
+
+    def test_extract_stop_at_sibling_title(self):
+        """Test stopping at a sibling-level title."""
+        markdown = """# 目标标题
+
+目标内容。
+
+# 另一个标题
+
+另一个内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "# 目标标题" in result
+        assert "目标内容" in result
+        assert "# 另一个标题" not in result
+
+    def test_extract_with_parent_titles(self):
+        """Test that parent titles are included."""
+        markdown = """# 父级标题
+
+父级内容。
+
+## 目标标题
+
+目标内容。
+
+### 子标题
+
+子内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "# 父级标题" in result
+        assert "## 目标标题" in result
+        assert "### 子标题" in result
+
+    def test_extract_multiple_matches(self):
+        """Test the case of multiple matching titles."""
+        markdown = """# 第一章
+
+## 目标标题
+
+第一章的目标内容。
+
+# 第二章
+
+## 目标标题
+
+第二章的目标内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "第一章的目标内容" in result
+        assert "第二章的目标内容" in result
+        assert "---" in result
+
+    def test_title_not_found(self):
+        """Test the case where the title does not exist."""
+        markdown = "# 其他标题\n内容"
+
+        result = extract_title_content(markdown, "不存在的标题")
+
+        assert result is None
+
+    def test_deep_nested_title(self):
+        """Test a deeply nested title."""
+        markdown = """# H1
+
+## H2
+
+### H3
+
+#### 目标标题
+
+目标内容。"""
+
+        result = extract_title_content(markdown, "目标标题")
+
+        assert result is not None
+        assert "# H1" in result
+        assert "## H2" in result
+        assert "### H3" in result
+        assert "#### 目标标题" in result
+
+
+class TestSearchMarkdown:
+    """Tests for the search_markdown function."""
+
+    def test_search_simple_pattern(self):
+        """Test a simple search pattern."""
+        content = """第一行
+第二行
+包含关键词的行
+第四行"""
+
+        result = search_markdown(content, "关键词", context_lines=0)
+
+        assert result is not None
+        assert "关键词" in result
+
+    def test_search_with_context(self):
+        """Test searching with context lines."""
+        content = """行1
+行2
+关键词行
+行4
+行5"""
+
+        result = search_markdown(content, "关键词", context_lines=1)
+
+        assert result is not None
+        assert "关键词" in result
+        assert "行2" in result or "行4" in result
+
+    def test_search_no_match(self):
+        """Test the no-match case."""
+        content = "普通内容"
+
+        result = search_markdown(content, "不存在的内容", context_lines=0)
+
+        assert result is None
+
+    def test_search_empty_content(self):
+        """Test empty content."""
+        result = search_markdown("", "关键词", context_lines=0)
+
+        assert result is None
+
+    def test_search_invalid_regex(self):
+        """Test an invalid regular expression."""
+        content = "内容"
+
+        result = search_markdown(content, "[invalid", context_lines=0)
+
+        assert result is None
+
+    def test_search_negative_context(self):
+        """Test a negative number of context lines."""
+        content = "内容"
+
+        with pytest.raises(ValueError):
+            search_markdown(content, "内容", context_lines=-1)
+
+    def test_search_multiple_matches_merged(self):
+        """Test that multiple matches are merged."""
+        content = """行1
+行2
+匹配1
+行4
+行5
+匹配2
+行7
+行8"""
+
+        result = search_markdown(content, "匹配", context_lines=1)
+
+        assert result is not None
+        assert "匹配1" in result
+        assert "匹配2" in result
+
+    def test_search_ignore_blank_lines_in_context(self):
+        """Test that the context calculation ignores blank lines."""
+        content = """行1
+
+行2
+关键词
+
+行4
+行5"""
+
+        result = search_markdown(content, "关键词", context_lines=1)
+
+        assert result is not None
+        assert "关键词" in result
+
+    def test_search_with_regex(self):
+        """Test searching with a regular expression."""
+        content = """apple
+banana
+cherry
+date"""
+
+        result = search_markdown(content, "^b", context_lines=0)
+
+        assert result is not None
+        assert "banana" in result
diff --git a/tests/test_core/test_parser.py b/tests/test_core/test_parser.py
new file mode 100644
index 0000000..7496a90
--- /dev/null
+++ b/tests/test_core/test_parser.py
@@ -0,0 +1,256 @@
+"""Tests for the parse-dispatch functions of the parser module."""
+
+import pytest
+from unittest.mock import patch, MagicMock
+import argparse
+import sys
+
+from core.parser import parse_input, process_content, output_result
+from core.exceptions import FileDetectionError, ReaderNotFoundError
+
+
+class MockReader:
+    """Mock Reader class used for testing."""
+
+    def __init__(self, supports=True, content=None, failures=None):
+        self._supports = supports
+        self._content = content
+        self._failures = failures or []
+
+    def supports(self, file_path):
+        return self._supports
+
+    def parse(self, file_path):
+        return self._content, self._failures
+
+
+class TestParseInput:
+    """Tests for the parse_input function."""
+
+    def test_parse_input_success(self):
+        """Test the successful-parse case."""
+        reader = MockReader(supports=True, content="测试内容", failures=[])
+        readers = [reader]
+
+        content, failures = parse_input("test.docx", readers)
+
+        assert content == "测试内容"
+        assert failures == []
+
+    def test_parse_input_reader_not_found(self):
+        """Test that no supporting reader is found."""
+        reader = MockReader(supports=False)
+        readers = [reader]
+
+        with pytest.raises(ReaderNotFoundError):
+            parse_input("test.docx", readers)
+
+    def test_parse_input_empty_path(self):
+        """Test an empty input path."""
+        readers = [MockReader()]
+
+        with pytest.raises(FileDetectionError):
+            parse_input("", readers)
+
+    def test_parse_input_multiple_readers_first_succeeds(self):
+        """Test multiple readers where the first one succeeds."""
+        reader1 = MockReader(supports=True, content="第一个结果", failures=[])
+        reader2 = MockReader(supports=True, content="第二个结果", failures=[])
+        readers = [reader1, reader2]
+
+        content, failures = parse_input("test.docx", readers)
+
+        assert content == "第一个结果"
+
+    def test_parse_input_with_failures(self):
+        """Test that parsing returns failure information."""
+        reader = MockReader(
+            supports=True,
+            content=None,
+            failures=["解析器1失败", "解析器2失败"]
+        )
+        readers = [reader]
+
+        content, failures = parse_input("test.docx", readers)
+
+        assert content is None
+        assert failures == ["解析器1失败", "解析器2失败"]
+
+
+class TestProcessContent:
+    """Tests for the process_content function."""
+
+    def test_process_content_removes_images(self):
+        """Test removal of image markup."""
+        content = "测试内容 ![alt](image.png) 更多内容"
+        result = process_content(content)
+
+        assert "![alt](image.png)" not in result
+        assert "测试内容" in result
+        assert "更多内容" in result
+
+    def test_process_content_normalizes_whitespace(self):
+        """Test normalization of whitespace characters."""
+        content = "line1\n\n\n\nline2\n\n\nline3"
+        result = process_content(content)
+
+        assert "line1\n\nline2\n\nline3" in result
+
+    def test_process_content_both_operations(self):
+        """Test performing both operations together."""
+        content = "![img](pic.png)\n\n\n\n正文"
+        result = process_content(content)
+
+        assert "![img](pic.png)" not in result
+        assert "\n\n\n\n" not in result
+
+
+class TestOutputResult:
+    """Tests for the output_result function."""
+
+    def test_output_default(self, capsys):
+        """Test the default content output."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=False,
+            title_content=None,
+            search=None,
+            context=2
+        )
+
+        output_result("测试内容", args)
+
+        captured = capsys.readouterr()
+        assert "测试内容" in captured.out
+
+    def test_output_count(self, capsys):
+        """Test the character-count output."""
+        args = argparse.Namespace(
+            count=True,
+            lines=False,
+            titles=False,
+            title_content=None,
+            search=None,
+            context=2
+        )
+
+        output_result("测试内容", args)
+
+        captured = capsys.readouterr()
+        assert captured.out.strip() == "4"
+
+    def test_output_lines(self, capsys):
+        """Test the line-count output."""
+        args = argparse.Namespace(
+            count=False,
+            lines=True,
+            titles=False,
+            title_content=None,
+            search=None,
+            context=2
+        )
+
+        output_result("line1\nline2\nline3", args)
+
+        captured = capsys.readouterr()
+        assert captured.out.strip() == "3"
+
+    def test_output_titles(self, capsys):
+        """Test title extraction."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=True,
+            title_content=None,
+            search=None,
+            context=2
+        )
+
+        content = "# 标题1\n正文\n## 标题2\n正文"
+        output_result(content, args)
+
+        captured = capsys.readouterr()
+        assert "# 标题1" in captured.out
+        assert "## 标题2" in captured.out
+
+    def test_output_title_content_found(self, capsys):
+        """Test extracting title content (found)."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=False,
+            title_content="目标标题",
+            search=None,
+            context=2
+        )
+
+        content = "# 目标标题\n标题下的内容"
+
+        with patch("sys.exit") as mock_exit:
+            output_result(content, args)
+            mock_exit.assert_not_called()
+
+        captured = capsys.readouterr()
+        assert "目标标题" in captured.out
+        assert "标题下的内容" in captured.out
+
+    def test_output_title_content_not_found(self, capsys):
+        """Test extracting title content (not found)."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=False,
+            title_content="不存在的标题",
+            search=None,
+            context=2
+        )
+
+        content = "# 标题1\n内容"
+
+        with patch("sys.exit") as mock_exit:
+            output_result(content, args)
+            mock_exit.assert_called_once_with(1)
+
+        captured = capsys.readouterr()
+        assert "未找到" in captured.out or "错误" in captured.out
+
+    def test_output_search_found(self, capsys):
+        """Test the search feature (found)."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=False,
+            title_content=None,
+            search="关键词",
+            context=2
+        )
+
+        content = "行1\n行2\n包含关键词的行\n行4\n行5"
+
+        with patch("sys.exit") as mock_exit:
+            output_result(content, args)
+            mock_exit.assert_not_called()
+
+        captured = capsys.readouterr()
+        assert "关键词" in captured.out
+
+    def test_output_search_not_found(self, capsys):
+        """Test the search feature (not found)."""
+        args = argparse.Namespace(
+            count=False,
+            lines=False,
+            titles=False,
+            title_content=None,
+            search="不存在的内容",
+            context=2
+        )
+
+        content = "普通内容"
+
+        with patch("sys.exit") as mock_exit:
+            output_result(content, args)
+            mock_exit.assert_called_once_with(1)
+
+        captured = capsys.readouterr()
+        assert "未找到" in captured.out or "错误" in captured.out
diff --git a/tests/test_readers/test_html_downloader.py b/tests/test_readers/test_html_downloader.py
new file mode 100644
index 0000000..e3873a2
--- /dev/null
+++ b/tests/test_readers/test_html_downloader.py
@@ -0,0 +1,43 @@
+"""Tests for the HTML downloader module."""
+
+import pytest
+from unittest.mock import patch, MagicMock
+
+from readers.html.downloader import download_html
+from readers.html.downloader import pyppeteer, selenium, httpx, urllib
+
+
+class TestDownloadHtml:
+    """Tests for the unified download_html entry point."""
+
+    def test_download_html_module_importable(self):
+        """Test that download_html can be imported and called."""
+        # Passing only requires that no exception is raised
+        assert callable(download_html)
+
+    def test_downloaders_available(self):
+        """Test that each downloader module is available."""
+        assert callable(pyppeteer.download)
+        assert callable(selenium.download)
+        assert callable(httpx.download)
+        assert callable(urllib.download)
+
+
+class TestIndividualDownloaders:
+    """Tests for the individual downloader modules."""
+
+    def test_pyppeteer_download_callable(self):
+        """Test that pyppeteer.download is callable."""
+        assert callable(pyppeteer.download)
+
+    def test_selenium_download_callable(self):
+        """Test that selenium.download is callable."""
+        assert callable(selenium.download)
+
+    def test_httpx_download_callable(self):
+        """Test that httpx.download is callable."""
+        assert callable(httpx.download)
+
+    def test_urllib_download_callable(self):
+        """Test that urllib.download is callable (standard library)."""
+        assert callable(urllib.download)
diff --git a/tests/test_utils/test_encoding_detection.py b/tests/test_utils/test_encoding_detection.py
new file mode 100644
index 0000000..4b18b4c
--- /dev/null
+++ b/tests/test_utils/test_encoding_detection.py
@@ -0,0 +1,46 @@
+"""Tests for the encoding_detection module."""
+
+import pytest
+from unittest.mock import patch, MagicMock
+
+from utils.encoding_detection import detect_encoding, read_text_file
+
+
+class TestDetectEncoding:
+    """Tests for the detect_encoding function."""
+
+    def test_detect_encoding_file_not_exists(self, tmp_path):
+        """Test a file that does not exist."""
+        non_existent = str(tmp_path / "non_existent.txt")
+
+        encoding, error = detect_encoding(non_existent)
+
+        assert encoding is None
+        assert error is not None
+
+
+class TestReadTextFile:
+    """Tests for the read_text_file function."""
+
+    def test_read_simple_file(self, tmp_path):
+        """Test reading a simple file."""
+        file_path = tmp_path / "test.txt"
+        content = "test content"
+        file_path.write_text(content, encoding="utf-8")
+
+        result, error = read_text_file(str(file_path))
+
+        # chardet may not be installed, in which case a fallback encoding should be used
+        # Passing only requires that no exception is raised
+        assert True
+
+    def test_read_actual_file(self, tmp_path):
+        """Test actually reading a file."""
+        file_path = tmp_path / "test.txt"
+        content = "简单测试内容"
+        file_path.write_text(content, encoding="utf-8")
+
+        result, error = read_text_file(str(file_path))
+
+        # The read should at least succeed (via a fallback encoding)
+        assert result is not None or error is not None