# lyxy-document/readers/html/markitdown.py

"""使用 MarkItDown 解析 HTML"""

import os
import tempfile
from typing import Optional, Tuple


def parse(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is given a file path. If ``temp_file_path`` names an existing
    file, that file is converted as-is and ``html_content`` is not used.
    Otherwise ``html_content`` is written to a freshly created temporary
    ``.html`` file, which is always removed again before returning.

    Args:
        html_content: HTML text to convert when no usable file is supplied.
        temp_file_path: Optional path to an existing HTML file. A file
            supplied by the caller is never deleted by this function.

    Returns:
        A ``(markdown, error)`` tuple. On success ``markdown`` is the
        converted text and ``error`` is ``None``. On failure ``markdown`` is
        ``None`` and ``error`` is a message describing the problem (library
        not installed, empty result, or a conversion error).
    """
    # Imported lazily so a missing optional dependency is reported as an
    # error value instead of breaking the import of this module.
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    # Path of the temp file created by this call (if any); only this file is
    # cleaned up, never one handed in by the caller.
    created_path: Optional[str] = None
    try:
        input_path = temp_file_path
        if not input_path or not os.path.exists(input_path):
            fd, created_path = tempfile.mkstemp(suffix='.html')
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)
            input_path = created_path

        # Extra keyword options are forwarded to MarkItDown.convert;
        # presumably they reach its HTML-to-Markdown converter (ATX headings,
        # listed tags stripped) — confirm against the installed version.
        result = MarkItDown().convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content

        # Treat a missing (None) result the same as whitespace-only output.
        if not markdown_content or not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        # Boundary handler: callers expect an error value, not an exception.
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        # Runs on success and on failure, so the temp file cannot leak.
        if created_path is not None:
            try:
                os.unlink(created_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the conversion result.
                pass