# lyxy-document/tests/test_readers/test_html/test_trafilatura_html.py

"""测试 Trafilatura HTML Reader 的解析功能。"""

import pytest
from scripts.readers.html import trafilatura


class TestTrafilaturaHtmlReaderParse:
    """Tests for the ``parse`` function of the Trafilatura HTML reader.

    Every test unpacks the result of ``trafilatura.parse(path)`` into a
    ``(content, error)`` pair. ``temp_html`` is a fixture that is called with
    a ``content=`` keyword and whose return value is passed to ``parse`` as
    the file path (presumably defined in a shared ``conftest.py`` -- confirm).

    Tests that can only make a meaningful assertion when text was actually
    extracted call ``pytest.skip`` when ``content`` is ``None`` instead of
    passing silently, so a run without usable output is visible in the test
    report rather than counted as a success.
    """

    def test_normal_file(self, temp_html):
        """Parse a small well-formed HTML snippet and check its text is kept."""
        file_path = temp_html(content="<h1>标题</h1><p>段落内容</p>")
        content, error = trafilatura.parse(file_path)
        if content is None:
            # No content means there is nothing to verify; report that
            # explicitly (with the reader's error) instead of passing vacuously.
            pytest.skip(f"trafilatura reader returned no content: {error}")
        assert "标题" in content or "段落" in content

    def test_file_not_exists(self, tmp_path):
        """A path that does not exist must yield no content and an error."""
        non_existent_path = str(tmp_path / "non_existent.html")
        content, error = trafilatura.parse(non_existent_path)
        assert content is None
        # Per the original author's note: when the library is not installed
        # the reader also returns None, only with a different error message,
        # so the message text itself is deliberately not checked here.
        assert error is not None, "expected an error for a missing file"

    def test_empty_file(self, temp_html):
        """An HTML document with an empty body must not produce any text."""
        file_path = temp_html(content="<html><body></body></html>")
        content, _error = trafilatura.parse(file_path)
        # Either "no content" (None) or a blank string is accepted.
        assert content is None or content.strip() == ""

    def test_special_chars(self, temp_html):
        """CJK text next to emoji and symbol characters must survive parsing."""
        file_path = temp_html(content="<p>中文测试 😀 ©®</p>")
        content, error = trafilatura.parse(file_path)
        if content is None:
            # Same reasoning as in test_normal_file: skip visibly rather than
            # letting the test pass without asserting anything.
            pytest.skip(f"trafilatura reader returned no content: {error}")
        assert "中文" in content or "测试" in content