Files
lyxy-document/tests/test_readers/test_docx/test_markitdown_docx.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

80 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""测试 MarkItDown DOCX Reader 的解析功能。"""
import pytest
import os
from readers.docx import markitdown
class TestMarkitdownDocxReaderParse:
    """Tests for ``markitdown.parse`` on DOCX input.

    Every test unpacks the result of ``markitdown.parse(path)`` as a
    ``(content, error)`` pair and checks one scenario: a regular document,
    a missing file, an empty document, a corrupted file, and text with
    non-ASCII characters.
    """

    def test_normal_file(self, temp_docx):
        """Parse a document with headings, paragraphs, a table and a list."""
        # Build a test document that mixes several kinds of content.
        file_path = temp_docx(
            headings=[(1, "主标题"), (2, "子标题")],
            paragraphs=["这是第一段内容。", "这是第二段内容。"],
            table_data=[["列1", "列2"], ["数据1", "数据2"]],
            list_items=["列表项1", "列表项2"],
        )

        content, error = markitdown.parse(file_path)

        # Without content there is nothing to verify. Report that as a skip
        # (with the reader's error) instead of letting the test pass silently.
        if content is None:
            pytest.skip(f"MarkItDown returned no content: {error}")

        # MarkItDown may format its output differently, so finding any one
        # of the key fragments is accepted.
        expected_fragments = ("主标题", "子标题", "第一段内容")
        assert any(fragment in content for fragment in expected_fragments)

    def test_file_not_exists(self, tmp_path):
        """A path that does not exist yields no content and an error."""
        non_existent_file = str(tmp_path / "non_existent.docx")

        content, error = markitdown.parse(non_existent_file)

        # Expect None for the content together with an error value.
        assert content is None
        assert error is not None

    def test_empty_file(self, temp_docx):
        """A document created without any content yields None or blank text."""
        # Create a document with no headings, paragraphs, tables or lists.
        file_path = temp_docx()

        content, error = markitdown.parse(file_path)

        # An empty document may come back as None or as whitespace-only text.
        assert content is None or content.strip() == ""

    def test_corrupted_file(self, temp_docx, tmp_path):
        """Parsing a file whose bytes are not a valid DOCX must not raise."""
        # Start from a valid document ...
        file_path = temp_docx(paragraphs=["测试内容"])
        # ... then overwrite it with bytes that are not a DOCX file.
        with open(file_path, "wb") as f:
            f.write(b"corrupted content that is not a valid docx file")

        content, error = markitdown.parse(file_path)

        # MarkItDown may try to parse arbitrary content, so a None result is
        # not required; the call must return without raising and provide
        # either content or an error.
        assert content is not None or error is not None

    def test_special_chars(self, temp_docx):
        """Non-ASCII text (CJK, emoji, symbols, RTL) survives parsing."""
        special_texts = [
            "中文测试内容",
            "Emoji测试: 😀🎉🚀",
            "特殊符号: ©®™°±",
            "混合内容: Hello你好🎉World世界",
            "阿拉伯文: مرحبا",  # right-to-left text
        ]
        file_path = temp_docx(paragraphs=special_texts)

        content, error = markitdown.parse(file_path)

        # Without content there is nothing to verify. Report that as a skip
        # (with the reader's error) instead of letting the test pass silently.
        if content is None:
            pytest.skip(f"MarkItDown returned no content: {error}")

        # Finding any one of the special-character fragments is accepted.
        expected_fragments = ("中文测试内容", "😀", "Hello你好")
        assert any(fragment in content for fragment in expected_fragments)