Skill/skills/lyxy-reader-html/scripts/html_parser.py

#!/usr/bin/env python3
"""HTML 解析模块，按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""

from typing import List, Optional, Tuple


def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using trafilatura.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the extracted
        text and ``error`` is ``None``; otherwise ``markdown`` is ``None`` and
        ``error`` describes why (library missing, ``None`` result, blank
        result, or an exception raised during extraction).
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    # Keyword options forwarded verbatim to trafilatura.extract.
    extract_options = {
        "output_format": "markdown",
        "include_formatting": True,
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "favor_recall": True,
        "include_comments": True,
    }

    try:
        extracted = trafilatura.extract(html_content, **extract_options)
        if extracted is None:
            return None, "trafilatura 返回 None"
        if extracted.strip():
            return extracted, None
        # Whitespace-only output is treated as a failed extraction.
        return None, "解析内容为空"
    except Exception as exc:
        return None, "trafilatura 解析失败: " + str(exc)


def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using domscribe.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; when
        the library is missing, the output is blank, or the conversion raises,
        ``markdown`` is ``None`` and ``error`` holds the reason.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        converted = html_to_markdown(html_content, {'extract_main_content': True})
        if converted.strip():
            return converted, None
        # Whitespace-only output is treated as a failed conversion.
        return None, "解析内容为空"
    except Exception as exc:
        return None, "domscribe 解析失败: " + str(exc)


def parse_with_markitdown(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is given a file path here. If ``temp_file_path`` points to an
    existing file it is used as-is and left in place; otherwise
    ``html_content`` is written to a fresh temporary ``.html`` file that this
    function always removes again, whether the conversion succeeds or fails.

    Args:
        html_content: Raw HTML text to convert.
        temp_file_path: Optional path to a file already containing the HTML.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; when
        the library is missing, the output is empty, or the conversion raises,
        ``markdown`` is ``None`` and ``error`` holds the reason.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    import os
    import tempfile

    input_path = temp_file_path
    # Tracks whether *we* created the file, so cleanup never touches a
    # caller-owned path and never skips a file we made ourselves.
    created_temp = False
    try:
        if not input_path or not os.path.exists(input_path):
            fd, input_path = tempfile.mkstemp(suffix='.html')
            created_temp = True
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        md = MarkItDown()
        # NOTE(review): the extra keyword options are presumably forwarded to
        # MarkItDown's HTML converter — confirm against the installed version.
        result = md.convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content

        # A missing or whitespace-only result counts as a failed conversion.
        if not markdown_content or not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        if created_temp:
            try:
                os.unlink(input_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # conversion result.
                pass


def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using html2text (last-resort fallback).

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; when
        the library is missing, the output is blank, or the conversion raises,
        ``markdown`` is ``None`` and ``error`` holds the reason.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    # Attribute values applied to the HTML2Text converter before use.
    settings = {
        "ignore_emphasis": False,
        "ignore_links": False,
        "ignore_images": True,
        "body_width": 0,
        "skip_internal_links": True,
    }

    try:
        h2t = html2text.HTML2Text()
        for attr, value in settings.items():
            setattr(h2t, attr, value)
        rendered = h2t.handle(html_content)
        if rendered.strip():
            return rendered, None
        # Whitespace-only output is treated as a failed conversion.
        return None, "解析内容为空"
    except Exception as exc:
        return None, "html2text 解析失败: " + str(exc)


def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
    """Convert HTML to Markdown, trying each parser in priority order.

    The parsers are attempted in the order trafilatura, domscribe,
    MarkItDown, html2text; the first one that yields content wins.

    Args:
        html_content: Raw HTML text to convert.
        temp_file_path: Optional path to a file containing the HTML; it is
            passed only to the MarkItDown parser.

    Returns:
        A ``(content, failures)`` pair. ``content`` is the Markdown from the
        first successful parser, or ``None`` if every parser failed.
        ``failures`` lists one ``"- <parser>: <reason>"`` entry for each
        parser that was tried and failed before the result was returned.
    """
    # Only MarkItDown needs the extra argument, so it alone gets a wrapper.
    parsers = [
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        ("MarkItDown", lambda c: parse_with_markitdown(c, temp_file_path)),
        ("html2text", parse_with_html2text),
    ]

    failures: List[str] = []
    for name, func in parsers:
        content, error = func(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures