# lyxy-document/tests/test_readers/test_html/test_consistency.py

"""测试所有 HTML Readers 的一致性。"""

import pytest
from scripts.readers.html import (
    html2text,
    markitdown,
    trafilatura,
    domscribe,
)


class TestHtmlReadersConsistency:
    """Check that the HTML readers extract the core text of the same file."""

    def test_all_readers_parse_same_content(self, temp_html):
        """Run every reader on one HTML document and look for the core text.

        A reader that returns no usable content is skipped instead of failing
        the test. The test fails when no reader succeeds at all, or when a
        successful reader's output contains none of the expected phrases.

        Args:
            temp_html: Callable (presumably a pytest fixture defined elsewhere
                -- confirm) that is called with ``content=`` and returns the
                path of the HTML file to read.
        """
        file_path = temp_html(content="""
            <html>
            <head><title>测试页面</title></head>
            <body>
                <h1>测试标题</h1>
                <p>这是测试段落内容。</p>
                <p>第二段内容。</p>
            </body>
            </html>
        """)

        # Load the HTML text once so every reader sees identical input.
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Each entry maps a reader name to a callable taking the HTML text and
        # returning a ``(content, error)`` pair. Only markitdown also needs
        # the file path, so it is the only one wrapped in a lambda.
        parsers = [
            ("html2text", html2text.parse),
            ("markitdown", lambda c: markitdown.parse(c, file_path)),
            ("trafilatura", trafilatura.parse),
            ("domscribe", domscribe.parse),
        ]

        successful_results = []
        failures = []
        for name, parser in parsers:
            content, error = parser(html_content)
            if content is not None and content.strip():
                successful_results.append((name, content))
            else:
                # Keep the reported error so a total failure is diagnosable.
                failures.append(f"{name}: {error}")

        assert successful_results, (
            "没有任何 reader 成功解析文件: " + "; ".join(failures)
        )

        # NOTE(review): any() only requires one phrase per reader, which makes
        # this a lenient check -- confirm whether all() is the real intent.
        core_texts = ["测试标题", "测试段落", "内容", "第二段"]
        for name, content in successful_results:
            assert any(text in content for text in core_texts), \
                f"{name} 解析结果不包含核心内容"