Files
lyxy-document/tests/test_readers/test_pdf/test_pypdf.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

103 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""测试 pypdf Reader 的解析功能。"""
import pytest
import os
from readers.pdf import PdfReader
class TestPypdfReaderParse:
    """Exercise ``PdfReader.parse`` with valid, missing, empty and corrupted PDFs.

    Every test unpacks the call as ``content, failures = reader.parse(path)``.
    The ``temp_pdf`` fixture is not defined in the visible part of this file;
    it is called as ``temp_pdf(text=...)`` and its return value is used as the
    path handed to the reader.
    """

    def test_normal_file(self, temp_pdf):
        """A PDF generated from known text parses and yields that text."""
        test_text = "这是测试PDF内容\n第二行内容\n第三行内容"
        file_path = temp_pdf(text=test_text)

        reader = PdfReader()
        content, failures = reader.parse(file_path)

        # Surface the reader's own failure list when nothing was extracted.
        assert content is not None, f"解析失败: {failures}"
        # Extraction may alter layout, so the shorter keyword is accepted as
        # well as the full phrase.
        assert "测试PDF内容" in content or "测试" in content

    def test_file_not_exists(self, tmp_path):
        """A path that does not exist yields no content and a "missing" failure."""
        non_existent_file = str(tmp_path / "non_existent.pdf")

        reader = PdfReader()
        content, failures = reader.parse(non_existent_file)

        assert content is None
        assert len(failures) > 0
        # At least one failure message must say the file is missing.
        assert any("不存在" in f or "找不到" in f for f in failures)

    def test_empty_file(self, temp_pdf):
        """A PDF generated from empty text yields ``None`` or blank content."""
        file_path = temp_pdf(text="")

        reader = PdfReader()
        content, failures = reader.parse(file_path)

        # Either outcome is accepted: no content at all, or only whitespace.
        assert content is None or content.strip() == ""

    def test_corrupted_file(self, temp_pdf):
        """A file whose bytes are not a PDF yields no content and a failure."""
        # Start from a real PDF so the path and extension are realistic.
        file_path = temp_pdf(text="测试内容")

        # Replace the WHOLE file with garbage. Patching only the leading bytes
        # (the previous "r+b" approach) leaves the objects, cross-reference
        # table and trailer intact, and a tolerant parser may still recover
        # the text, which would defeat the purpose of this test.
        with open(file_path, "wb") as f:
            f.write(b"corrupted content")

        reader = PdfReader()
        content, failures = reader.parse(file_path)

        assert content is None
        assert len(failures) > 0

    def test_special_chars(self, temp_pdf):
        """Mixed Chinese/ASCII text parses to non-blank content."""
        # How faithfully the characters survive depends on the font used to
        # render the PDF, so only "something was extracted" is checked.
        test_text = "中文English混合123"
        file_path = temp_pdf(text=test_text)

        reader = PdfReader()
        content, failures = reader.parse(file_path)

        # Assert unconditionally: guarding this with ``if content:`` let the
        # test pass even when parsing failed outright.
        assert content is not None, f"解析失败: {failures}"
        assert content.strip()
class TestPypdfReaderSupports:
    """Check which file names ``PdfReader.supports`` accepts or rejects."""

    def test_supports_pdf_extension(self):
        """A lowercase ``.pdf`` file name is accepted."""
        pdf_reader = PdfReader()
        accepted = pdf_reader.supports("test.pdf")
        assert accepted is True

    def test_supports_uppercase_extension(self):
        """An uppercase ``.PDF`` file name is accepted."""
        pdf_reader = PdfReader()
        accepted = pdf_reader.supports("TEST.PDF")
        assert accepted is True

    def test_rejects_unsupported_format(self):
        """Names with a non-PDF extension are rejected."""
        pdf_reader = PdfReader()
        # Same reader instance for both names, checked in the original order.
        for file_name in ("test.docx", "test.txt"):
            assert pdf_reader.supports(file_name) is False

    def test_supports_path_with_spaces(self):
        """A path containing spaces is accepted when it ends in ``.pdf``."""
        pdf_reader = PdfReader()
        accepted = pdf_reader.supports("path with spaces/test.pdf")
        assert accepted is True