# lyxy-document/tests/test_readers/test_pdf/test_unstructured_pdf.py

"""测试 Unstructured PDF Reader 的解析功能。"""

import pytest
from readers.pdf import unstructured


class TestUnstructuredPdfReaderParse:
    """Tests for the ``parse`` function of the Unstructured PDF reader.

    Every test calls ``unstructured.parse(path)`` and unpacks the result as a
    ``(content, error)`` pair.
    """

    @staticmethod
    def _require_content(content, error):
        """Return *content*, skipping the running test when it is ``None``.

        The content checks used to sit behind ``if content is not None:``,
        so a parse that produced nothing was reported as a pass without
        verifying anything. Skipping keeps that tolerance but makes the
        unverified case visible in the test report, together with the
        error returned by the parser.

        NOTE(review): presumably ``None`` content on a valid PDF means the
        optional parsing backend is unavailable -- confirm; if it can only
        mean a real failure, tighten this to a hard assertion.
        """
        if content is None:
            pytest.skip(f"unstructured.parse returned no content: {error}")
        return content

    def test_normal_file(self, temp_pdf):
        """Parsing a regular PDF yields at least part of the source text."""
        file_path = temp_pdf(text="测试PDF内容\n第二行内容")
        content, error = unstructured.parse(file_path)
        content = self._require_content(content, error)
        # Extraction may not be lossless, so any one fragment is enough.
        assert any(fragment in content for fragment in ("测试", "PDF", "内容"))

    def test_file_not_exists(self, tmp_path):
        """A path that does not exist yields no content and an error."""
        non_existent_file = str(tmp_path / "non_existent.pdf")
        content, error = unstructured.parse(non_existent_file)
        assert content is None
        assert error is not None

    def test_empty_file(self, temp_pdf):
        """A PDF created without text yields no content or only whitespace."""
        file_path = temp_pdf()
        content, error = unstructured.parse(file_path)
        assert content is None or content.strip() == ""

    def test_corrupted_file(self, temp_pdf, tmp_path):
        """A file whose bytes are not a valid PDF yields an error."""
        file_path = temp_pdf(text="测试内容")
        # Overwrite the generated PDF in place with bytes that are not a PDF.
        with open(file_path, "wb") as f:
            f.write(b"corrupted content")
        content, error = unstructured.parse(file_path)
        assert content is None
        assert error is not None

    def test_special_chars(self, temp_pdf):
        """Text with CJK characters, emoji and symbols still yields output."""
        file_path = temp_pdf(text="中文测试\nEmoji: 😀\n特殊符号: ©®")
        content, error = unstructured.parse(file_path)
        content = self._require_content(content, error)
        # PDF parsing may not preserve every character perfectly, so only
        # require that some non-blank content was extracted.
        assert len(content.strip()) > 0