Files
lyxy-document/tests/test_readers/test_docx/test_python_docx.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

142 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""测试 python-docx Reader 的解析功能。"""
import pytest
import os
from readers.docx import DocxReader
class TestPythonDocxReaderParse:
    """Exercise ``DocxReader.parse`` against generated DOCX fixtures."""

    def test_normal_file(self, temp_docx):
        """A document mixing headings, paragraphs, a table and a list parses."""
        # Build a fixture that contains several kinds of content at once.
        docx_path = temp_docx(
            headings=[(1, "主标题"), (2, "子标题")],
            paragraphs=["这是第一段内容。", "这是第二段内容。"],
            table_data=[["列1", "列2"], ["数据1", "数据2"]],
            list_items=["列表项1", "列表项2"],
        )

        content, failures = DocxReader().parse(docx_path)

        # Parsing must produce content; any reported entries must be either
        # empty or success notes.
        assert content is not None, f"解析失败: {failures}"
        only_benign = all("成功" in note or not note for note in failures)
        assert len(failures) == 0 or only_benign

        # Headings and paragraphs must all survive the conversion.
        for expected in ("主标题", "子标题", "第一段内容", "第二段内容"):
            assert expected in content
        # Table content: at least one header cell has to show up.
        assert any(cell in content for cell in ("列1", "列2"))
        assert "列表项1" in content

    def test_file_not_exists(self, tmp_path):
        """Parsing a missing path yields ``None`` and a 'not found' message."""
        missing_path = str(tmp_path.joinpath("non_existent.docx"))

        content, failures = DocxReader().parse(missing_path)

        # No content, and at least one failure entry mentioning the absence.
        assert content is None
        assert len(failures) > 0
        assert any("不存在" in msg or "找不到" in msg for msg in failures)

    def test_empty_file(self, temp_docx):
        """A DOCX created without any content parses to ``None`` or blank."""
        # Calling the fixture with no arguments creates a content-free file.
        blank_path = temp_docx()

        content, _failures = DocxReader().parse(blank_path)

        # Either outcome is accepted: no content at all, or whitespace only.
        if content is not None:
            assert content.strip() == ""

    def test_corrupted_file(self, temp_docx, tmp_path):
        """A file whose bytes are not a valid DOCX is reported as a failure."""
        # Start from a valid document ...
        docx_path = temp_docx(paragraphs=["测试内容"])
        # ... then overwrite it completely with bytes that are not a DOCX.
        with open(docx_path, "wb") as handle:
            handle.write(b"corrupted content that is not a valid docx file")

        content, failures = DocxReader().parse(docx_path)

        # No content, and at least one failure entry.
        assert content is None
        assert len(failures) > 0

    def test_special_chars(self, temp_docx):
        """CJK text, emoji, symbols and mixed scripts survive parsing."""
        samples = [
            "中文测试内容",
            "Emoji测试: 😀🎉🚀",
            "特殊符号: ©®™°±",
            "混合内容: Hello你好🎉World世界",
            "阿拉伯文: مرحبا",  # RTL text
        ]
        docx_path = temp_docx(paragraphs=samples)

        content, failures = DocxReader().parse(docx_path)

        assert content is not None, f"解析失败: {failures}"
        # Each category only needs one representative to be present; the
        # Arabic (RTL) sample is written to the fixture but not asserted on.
        assert "中文测试内容" in content
        assert any(emoji in content for emoji in ("😀", "🎉"))  # at least one emoji
        assert any(symbol in content for symbol in ("©", "®"))  # at least one symbol
        assert any(part in content for part in ("Hello你好", "World世界"))
class TestPythonDocxReaderSupports:
    """Exercise ``DocxReader.supports`` with a range of path shapes."""

    def test_supports_docx_extension(self):
        """A plain ``.docx`` file name is accepted."""
        reader = DocxReader()
        assert reader.supports("test.docx") is True

    def test_supports_uppercase_extension(self):
        """An upper-case ``.DOCX`` file name is accepted."""
        reader = DocxReader()
        assert reader.supports("TEST.DOCX") is True

    def test_supports_doc_extension(self):
        """A legacy ``.doc`` path is answered with a bool, without raising.

        Whether ``.doc`` is accepted is deliberately left unpinned, so only
        the return type is checked; without any assertion this test could
        never fail unless ``supports`` raised.
        """
        reader = DocxReader()
        result = reader.supports("test.doc")
        # NOTE(review): the original comment says the python-docx reader only
        # supports .docx; if that is confirmed, tighten this to `is False`.
        assert isinstance(result, bool)

    def test_rejects_unsupported_format(self):
        """Non-DOCX extensions are rejected."""
        reader = DocxReader()
        assert reader.supports("test.pdf") is False
        assert reader.supports("test.txt") is False

    def test_supports_url(self):
        """A URL ending in ``.docx`` is answered with a bool, without raising.

        URL support may differ between reader implementations, so the value
        itself is not asserted; only the return type is checked.
        """
        reader = DocxReader()
        result = reader.supports("http://example.com/file.docx")
        assert isinstance(result, bool)

    def test_supports_path_with_spaces(self):
        """A path containing spaces is accepted."""
        reader = DocxReader()
        assert reader.supports("path with spaces/test.docx") is True

    def test_supports_absolute_path(self):
        """POSIX-style and Windows-style absolute paths are accepted."""
        reader = DocxReader()
        assert reader.supports("/absolute/path/test.docx") is True
        assert reader.supports("C:\\Windows\\path\\test.docx") is True