refactor: 重构 Reader 内部工具函数到独立模块
- 新增 scripts/readers/_utils.py 作为 Reader 内部共享工具模块
- 将 parse_with_markitdown 等函数从 core/markdown.py 迁移到 _utils.py
- 函数重命名:parse_with_xxx → parse_via_xxx,_unstructured_elements_to_markdown → convert_unstructured_to_markdown
- 更新 17 个 Reader 实现文件的 import 路径
- 从 core/__init__.py 移除已迁移函数的导出
- 新增测试文件 tests/test_readers/test_utils.py
- 新增 spec 文档 openspec/specs/reader-internal-utils/spec.md

这次重构明确了模块边界:core/ 提供公共 API,readers/_utils.py 提供 Reader 内部工具。
This commit is contained in:
203
tests/test_readers/test_utils.py
Normal file
203
tests/test_readers/test_utils.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""测试 Reader 内部工具函数。"""
|
||||
|
||||
import zipfile
|
||||
import pytest
|
||||
from scripts.readers._utils import (
|
||||
parse_via_markitdown,
|
||||
parse_via_docling,
|
||||
build_markdown_table,
|
||||
flush_list_stack,
|
||||
safe_open_zip,
|
||||
convert_unstructured_to_markdown,
|
||||
_UNSTRUCTURED_RGB_PATTERN,
|
||||
_UNSTRUCTURED_PAGE_NUMBER_PATTERN,
|
||||
)
|
||||
|
||||
|
||||
class TestBuildMarkdownTable:
    """Tests for ``build_markdown_table``."""

    def test_standard_table(self):
        """A header row plus two data rows yields the expected table lines."""
        rendered = build_markdown_table(
            [["姓名", "年龄"], ["张三", "25"], ["李四", "30"]]
        )

        # Header, separator and both data rows must all be present.
        expected_lines = (
            "| 姓名 | 年龄 |",
            "| --- | --- |",
            "| 张三 | 25 |",
            "| 李四 | 30 |",
        )
        for line in expected_lines:
            assert line in rendered

    def test_empty_table(self):
        """No rows, or a single empty row, both produce an empty string."""
        for empty_input in ([], [[]]):
            assert build_markdown_table(empty_input) == ""

    def test_table_with_empty_cells(self):
        """Empty cells are still rendered as columns in their row."""
        rendered = build_markdown_table([["A", "B"], ["", "C"], ["D", ""]])

        for line in ("| A | B |", "| | C |", "| D | |"):
            assert line in rendered
|
||||
|
||||
|
||||
class TestFlushListStack:
    """Tests for ``flush_list_stack``."""

    def test_flush_non_empty_items(self):
        """Non-empty entries are moved to the target; the stack ends up empty."""
        pending = ["item1\n", "", "item2\n"]
        collected = []

        flush_list_stack(pending, collected)

        # The empty entry is dropped; each kept entry gains one extra newline.
        assert collected == ["item1\n\n", "item2\n\n"]
        assert pending == []

    def test_flush_all_empty(self):
        """A stack of only empty strings adds nothing and is still cleared."""
        pending = ["", "", ""]
        collected = []

        flush_list_stack(pending, collected)

        assert collected == []
        assert pending == []
|
||||
|
||||
|
||||
class TestSafeOpenZip:
    """Tests for ``safe_open_zip``."""

    @staticmethod
    def _write_archive(destination, member_name):
        """Create a ZIP at ``destination`` holding one member with body "content"."""
        with zipfile.ZipFile(destination, "w") as archive:
            archive.writestr(member_name, "content")

    def test_open_valid_file(self, tmp_path):
        """An existing, plainly named member can be opened and read."""
        archive_path = tmp_path / "test.zip"
        self._write_archive(archive_path, "valid.txt")

        with zipfile.ZipFile(archive_path, "r") as archive:
            handle = safe_open_zip(archive, "valid.txt")
            assert handle is not None
            assert handle.read() == b"content"

    def test_reject_path_traversal(self, tmp_path):
        """Names containing ``..`` path-traversal segments return ``None``."""
        archive_path = tmp_path / "test.zip"
        self._write_archive(archive_path, "safe.txt")

        with zipfile.ZipFile(archive_path, "r") as archive:
            for unsafe_name in ("../etc/passwd", "sub/../../etc/passwd"):
                assert safe_open_zip(archive, unsafe_name) is None

    def test_reject_absolute_path(self, tmp_path):
        """POSIX-style and Windows-style absolute paths return ``None``."""
        archive_path = tmp_path / "test.zip"
        self._write_archive(archive_path, "safe.txt")

        with zipfile.ZipFile(archive_path, "r") as archive:
            for unsafe_name in (
                "/absolute/path.txt",
                "C:\\Windows\\System32\\config",
            ):
                assert safe_open_zip(archive, unsafe_name) is None

    def test_empty_name(self):
        """An empty member name returns ``None``."""
        import io

        # Build the archive in memory, then reopen it from a copy of the bytes.
        buffer = io.BytesIO()
        self._write_archive(buffer, "test.txt")
        reopened = io.BytesIO(buffer.getvalue())

        with zipfile.ZipFile(reopened, "r") as archive:
            assert safe_open_zip(archive, "") is None
|
||||
|
||||
|
||||
class TestUnstructuredPatterns:
    """Tests for the noise-matching patterns used on unstructured output."""

    def test_rgb_pattern(self):
        """``R:<n> G:<n> B:<n>`` strings match the RGB pattern."""
        samples = ("R:255 G:128 B:0", "R:0 G:0 B:0", "R:255 G:255 B:255")
        for sample in samples:
            assert _UNSTRUCTURED_RGB_PATTERN.match(sample)

    def test_rgb_pattern_invalid(self):
        """Bare numbers and a wrongly prefixed string do not match."""
        for sample in ("255 128 0", "RGB:255 G:128 B:0"):
            assert not _UNSTRUCTURED_RGB_PATTERN.match(sample)

    def test_page_number_pattern(self):
        """Em-dash-wrapped numbers match the page-number pattern."""
        for sample in ("— 3 —", "— 123 —", "— 1 —"):
            assert _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match(sample)

    def test_page_number_pattern_invalid(self):
        """``Page N`` text and hyphen-wrapped numbers do not match."""
        for sample in ("Page 3", "--- 3 ---"):
            assert not _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match(sample)
|
||||
|
||||
|
||||
class TestConvertUnstructuredToMarkdown:
    """Tests for ``convert_unstructured_to_markdown``."""

    @staticmethod
    def _text_element_class():
        """Return unstructured's ``Text`` class, or skip the test if import fails."""
        try:
            from unstructured.documents.elements import Text
        except ImportError:
            pytest.skip("unstructured 库未安装")
        return Text

    def test_skip_rgb_pattern(self):
        """An RGB noise element is left out while ordinary text is kept."""
        text_cls = self._text_element_class()

        markdown = convert_unstructured_to_markdown(
            [text_cls("R:255 G:128 B:0"), text_cls("正常文本")]
        )

        assert "R:255 G:128 B:0" not in markdown
        assert "正常文本" in markdown

    def test_skip_page_number_pattern(self):
        """A page-number noise element is left out while ordinary text is kept."""
        text_cls = self._text_element_class()

        markdown = convert_unstructured_to_markdown(
            [text_cls("— 3 —"), text_cls("正常文本")]
        )

        assert "— 3 —" not in markdown
        assert "正常文本" in markdown

    def test_convert_without_markdownify(self):
        """Plain objects exposing only ``text`` have their text carried through.

        NOTE(review): the test name refers to the markdownify-missing
        fallback, but nothing here makes markdownify unavailable; it only
        checks that both texts appear in the result.
        """

        # Minimal stand-in that carries nothing but a ``text`` attribute.
        class MockElement:
            def __init__(self, text):
                self.text = text

        stand_ins = [MockElement(value) for value in ("文本1", "文本2")]
        markdown = convert_unstructured_to_markdown(stand_ins)

        # Both texts must survive the conversion.
        for value in ("文本1", "文本2"):
            assert value in markdown
|
||||
|
||||
|
||||
class TestParseViaMarkitdown:
    """Tests for ``parse_via_markitdown``."""

    def test_parse_nonexistent_file(self):
        """A missing file gives ``None`` content and a non-``None`` error."""
        outcome = parse_via_markitdown("/nonexistent/file.txt")
        markdown, failure = outcome

        assert markdown is None
        assert failure is not None
|
||||
|
||||
|
||||
class TestParseViaDocling:
    """Tests for ``parse_via_docling``."""

    def test_parse_nonexistent_file(self):
        """A missing file gives ``None`` content and a non-``None`` error."""
        outcome = parse_via_docling("/nonexistent/file.txt")
        markdown, failure = outcome

        assert markdown is None
        assert failure is not None
|
||||
Reference in New Issue
Block a user