"""测试 Reader 内部工具函数。""" import zipfile import pytest from scripts.readers._utils import ( parse_via_markitdown, parse_via_docling, build_markdown_table, flush_list_stack, safe_open_zip, convert_unstructured_to_markdown, _UNSTRUCTURED_RGB_PATTERN, _UNSTRUCTURED_PAGE_NUMBER_PATTERN, ) class TestBuildMarkdownTable: """测试 build_markdown_table 函数。""" def test_standard_table(self): """测试标准表格格式化。""" rows_data = [["姓名", "年龄"], ["张三", "25"], ["李四", "30"]] result = build_markdown_table(rows_data) assert "| 姓名 | 年龄 |" in result assert "| --- | --- |" in result assert "| 张三 | 25 |" in result assert "| 李四 | 30 |" in result def test_empty_table(self): """测试空表格。""" assert build_markdown_table([]) == "" assert build_markdown_table([[]]) == "" def test_table_with_empty_cells(self): """测试包含空单元格的表格。""" rows_data = [["A", "B"], ["", "C"], ["D", ""]] result = build_markdown_table(rows_data) assert "| A | B |" in result assert "| | C |" in result assert "| D | |" in result class TestFlushListStack: """测试 flush_list_stack 函数。""" def test_flush_non_empty_items(self): """测试刷新非空堆栈。""" list_stack = ["item1\n", "", "item2\n"] target = [] flush_list_stack(list_stack, target) assert target == ["item1\n\n", "item2\n\n"] assert list_stack == [] def test_flush_all_empty(self): """测试刷新空堆栈。""" list_stack = ["", "", ""] target = [] flush_list_stack(list_stack, target) assert target == [] assert list_stack == [] class TestSafeOpenZip: """测试 safe_open_zip 函数。""" def test_open_valid_file(self, tmp_path): """测试打开合法文件。""" # 创建测试 ZIP 文件 zip_path = tmp_path / "test.zip" with zipfile.ZipFile(zip_path, "w") as zf: zf.writestr("valid.txt", "content") with zipfile.ZipFile(zip_path, "r") as zf: result = safe_open_zip(zf, "valid.txt") assert result is not None assert result.read() == b"content" def test_reject_path_traversal(self, tmp_path): """测试拒绝路径遍历攻击。""" zip_path = tmp_path / "test.zip" with zipfile.ZipFile(zip_path, "w") as zf: zf.writestr("safe.txt", "content") with zipfile.ZipFile(zip_path, "r") as zf: assert safe_open_zip(zf, "../etc/passwd") is None assert safe_open_zip(zf, "sub/../../etc/passwd") is None def test_reject_absolute_path(self, tmp_path): """测试拒绝绝对路径。""" zip_path = tmp_path / "test.zip" with zipfile.ZipFile(zip_path, "w") as zf: zf.writestr("safe.txt", "content") with zipfile.ZipFile(zip_path, "r") as zf: assert safe_open_zip(zf, "/absolute/path.txt") is None assert safe_open_zip(zf, "C:\\Windows\\System32\\config") is None def test_empty_name(self): """测试空文件名。""" import io zip_buffer = io.BytesIO() with zipfile.ZipFile(zip_buffer, "w") as zf: zf.writestr("test.txt", "content") with zipfile.ZipFile(io.BytesIO(zip_buffer.getvalue()), "r") as zf: assert safe_open_zip(zf, "") is None class TestUnstructuredPatterns: """测试 unstructured 噪声匹配模式。""" def test_rgb_pattern(self): """测试 RGB 颜色值模式。""" assert _UNSTRUCTURED_RGB_PATTERN.match("R:255 G:128 B:0") assert _UNSTRUCTURED_RGB_PATTERN.match("R:0 G:0 B:0") assert _UNSTRUCTURED_RGB_PATTERN.match("R:255 G:255 B:255") def test_rgb_pattern_invalid(self): """测试无效 RGB 值。""" assert not _UNSTRUCTURED_RGB_PATTERN.match("255 128 0") assert not _UNSTRUCTURED_RGB_PATTERN.match("RGB:255 G:128 B:0") def test_page_number_pattern(self): """测试页码模式。""" assert _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match("— 3 —") assert _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match("— 123 —") assert _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match("— 1 —") def test_page_number_pattern_invalid(self): """测试无效页码。""" assert not _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match("Page 3") assert not _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match("--- 3 ---") class TestConvertUnstructuredToMarkdown: """测试 convert_unstructured_to_markdown 函数。""" def test_skip_rgb_pattern(self): """测试跳过 RGB 噪声。""" try: from unstructured.documents.elements import Text except ImportError: pytest.skip("unstructured 库未安装") elements = [Text("R:255 G:128 B:0"), Text("正常文本")] result = convert_unstructured_to_markdown(elements) assert "R:255 G:128 B:0" not in result assert "正常文本" in result def test_skip_page_number_pattern(self): """测试跳过页码噪声。""" try: from unstructured.documents.elements import Text except ImportError: pytest.skip("unstructured 库未安装") elements = [Text("— 3 —"), Text("正常文本")] result = convert_unstructured_to_markdown(elements) assert "— 3 —" not in result assert "正常文本" in result def test_convert_without_markdownify(self): """测试未安装 markdownify 时的回退行为。""" # 创建简单的 mock 对象 class MockElement: def __init__(self, text): self.text = text elements = [MockElement("文本1"), MockElement("文本2")] result = convert_unstructured_to_markdown(elements) # 应该回退到简单连接文本 assert "文本1" in result assert "文本2" in result class TestParseViaMarkitdown: """测试 parse_via_markitdown 函数。""" def test_parse_nonexistent_file(self): """测试解析不存在的文件。""" content, error = parse_via_markitdown("/nonexistent/file.txt") assert content is None assert error is not None class TestParseViaDocling: """测试 parse_via_docling 函数。""" def test_parse_nonexistent_file(self): """测试解析不存在的文件。""" content, error = parse_via_docling("/nonexistent/file.txt") assert content is None assert error is not None