Files
lyxy-document/tests/test_readers/test_utils.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

204 lines
6.6 KiB
Python

"""测试 Reader 内部工具函数。"""
import zipfile
import pytest
from readers._utils import (
parse_via_markitdown,
parse_via_docling,
build_markdown_table,
flush_list_stack,
safe_open_zip,
convert_unstructured_to_markdown,
_UNSTRUCTURED_RGB_PATTERN,
_UNSTRUCTURED_PAGE_NUMBER_PATTERN,
)
class TestBuildMarkdownTable:
    """Tests for ``build_markdown_table``."""

    def test_standard_table(self):
        """Header, separator and data rows all appear in the rendered table."""
        table = build_markdown_table(
            [["姓名", "年龄"], ["张三", "25"], ["李四", "30"]]
        )
        expected_lines = (
            "| 姓名 | 年龄 |",
            "| --- | --- |",
            "| 张三 | 25 |",
            "| 李四 | 30 |",
        )
        for line in expected_lines:
            assert line in table

    def test_empty_table(self):
        """An empty row list and a single empty row both yield an empty string."""
        for empty_input in ([], [[]]):
            assert build_markdown_table(empty_input) == ""

    def test_table_with_empty_cells(self):
        """Rows containing empty strings still produce the expected row text."""
        table = build_markdown_table([["A", "B"], ["", "C"], ["D", ""]])
        for line in ("| A | B |", "| | C |", "| D | |"):
            assert line in table
class TestFlushListStack:
    """Tests for ``flush_list_stack``."""

    def test_flush_non_empty_items(self):
        """Non-empty entries reach the target with an extra newline; the stack ends up empty."""
        pending = ["item1\n", "", "item2\n"]
        output = []

        flush_list_stack(pending, output)

        assert output == ["item1\n\n", "item2\n\n"]
        assert pending == []

    def test_flush_all_empty(self):
        """A stack holding only empty strings adds nothing to the target and is cleared."""
        pending = [""] * 3
        output = []

        flush_list_stack(pending, output)

        assert output == []
        assert pending == []
class TestSafeOpenZip:
    """Tests for ``safe_open_zip``."""

    def test_open_valid_file(self, tmp_path):
        """A member present in the archive is returned and can be read."""
        # Build a throwaway ZIP archive containing a single member.
        archive_path = tmp_path / "test.zip"
        with zipfile.ZipFile(archive_path, "w") as writer:
            writer.writestr("valid.txt", "content")

        with zipfile.ZipFile(archive_path, "r") as archive:
            member = safe_open_zip(archive, "valid.txt")
            assert member is not None
            assert member.read() == b"content"

    def test_reject_path_traversal(self, tmp_path):
        """Names containing ``..`` path-traversal segments are rejected with ``None``."""
        archive_path = tmp_path / "test.zip"
        with zipfile.ZipFile(archive_path, "w") as writer:
            writer.writestr("safe.txt", "content")

        with zipfile.ZipFile(archive_path, "r") as archive:
            for unsafe_name in ("../etc/passwd", "sub/../../etc/passwd"):
                assert safe_open_zip(archive, unsafe_name) is None

    def test_reject_absolute_path(self, tmp_path):
        """POSIX-style and Windows-style absolute paths are rejected with ``None``."""
        archive_path = tmp_path / "test.zip"
        with zipfile.ZipFile(archive_path, "w") as writer:
            writer.writestr("safe.txt", "content")

        with zipfile.ZipFile(archive_path, "r") as archive:
            for absolute_name in (
                "/absolute/path.txt",
                "C:\\Windows\\System32\\config",
            ):
                assert safe_open_zip(archive, absolute_name) is None

    def test_empty_name(self):
        """An empty member name is rejected with ``None``."""
        import io

        # The archive lives purely in memory; no temporary file is needed.
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, "w") as writer:
            writer.writestr("test.txt", "content")

        archive_bytes = buffer.getvalue()
        with zipfile.ZipFile(io.BytesIO(archive_bytes), "r") as archive:
            assert safe_open_zip(archive, "") is None
class TestUnstructuredPatterns:
    """Tests for the patterns used to match unstructured noise."""

    def test_rgb_pattern(self):
        """Strings of the form ``R:<n> G:<n> B:<n>`` match the RGB pattern."""
        for sample in ("R:255 G:128 B:0", "R:0 G:0 B:0", "R:255 G:255 B:255"):
            assert _UNSTRUCTURED_RGB_PATTERN.match(sample)

    def test_rgb_pattern_invalid(self):
        """Bare numbers and an ``RGB:`` prefix do not match the RGB pattern."""
        for sample in ("255 128 0", "RGB:255 G:128 B:0"):
            assert not _UNSTRUCTURED_RGB_PATTERN.match(sample)

    def test_page_number_pattern(self):
        """Numbers wrapped in em dashes match the page-number pattern."""
        for sample in ("— 3 —", "— 123 —", "— 1 —"):
            assert _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match(sample)

    def test_page_number_pattern_invalid(self):
        """``Page N`` and hyphen-wrapped numbers do not match the page-number pattern."""
        for sample in ("Page 3", "--- 3 ---"):
            assert not _UNSTRUCTURED_PAGE_NUMBER_PATTERN.match(sample)
class TestConvertUnstructuredToMarkdown:
    """Tests for ``convert_unstructured_to_markdown``."""

    @staticmethod
    def _text_element_class():
        """Return unstructured's ``Text`` class, skipping the test if the import fails."""
        try:
            from unstructured.documents.elements import Text
        except ImportError:
            pytest.skip("unstructured 库未安装")
        return Text

    def test_skip_rgb_pattern(self):
        """RGB noise text is dropped while ordinary text is kept."""
        Text = self._text_element_class()
        markdown = convert_unstructured_to_markdown(
            [Text("R:255 G:128 B:0"), Text("正常文本")]
        )
        assert "R:255 G:128 B:0" not in markdown
        assert "正常文本" in markdown

    def test_skip_page_number_pattern(self):
        """Page-number noise text is dropped while ordinary text is kept."""
        Text = self._text_element_class()
        markdown = convert_unstructured_to_markdown(
            [Text("— 3 —"), Text("正常文本")]
        )
        assert "— 3 —" not in markdown
        assert "正常文本" in markdown

    def test_convert_without_markdownify(self):
        """Plain objects exposing only ``text`` still contribute their text to the output.

        NOTE(review): the test name refers to the fallback used when
        markdownify is not installed, but nothing here makes markdownify
        unavailable — confirm this really exercises that path.
        """

        # Minimal stand-in element that carries nothing but a ``text`` attribute.
        class MockElement:
            def __init__(self, text):
                self.text = text

        markdown = convert_unstructured_to_markdown(
            [MockElement(text) for text in ("文本1", "文本2")]
        )
        # Both pieces of text are expected to survive in the result.
        for fragment in ("文本1", "文本2"):
            assert fragment in markdown
class TestParseViaMarkitdown:
    """Tests for ``parse_via_markitdown``."""

    def test_parse_nonexistent_file(self):
        """A missing input file gives no content and a non-``None`` error."""
        missing_path = "/nonexistent/file.txt"
        markdown, failure = parse_via_markitdown(missing_path)
        assert markdown is None
        assert failure is not None
class TestParseViaDocling:
    """Tests for ``parse_via_docling``."""

    def test_parse_nonexistent_file(self):
        """A missing input file gives no content and a non-``None`` error."""
        missing_path = "/nonexistent/file.txt"
        markdown, failure = parse_via_docling(missing_path)
        assert markdown is None
        assert failure is not None