"""使用 MarkItDown 解析 HTML""" import os import tempfile from typing import Optional, Tuple def parse(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]: """使用 MarkItDown 解析 HTML""" try: from markitdown import MarkItDown except ImportError: return None, "MarkItDown 库未安装" try: input_path = temp_file_path if not input_path or not os.path.exists(input_path): # 创建临时文件 fd, input_path = tempfile.mkstemp(suffix='.html') with os.fdopen(fd, 'w', encoding='utf-8') as f: f.write(html_content) md = MarkItDown() result = md.convert( input_path, heading_style="ATX", strip=["img", "script", "style", "noscript"], ) markdown_content = result.text_content if not temp_file_path: try: os.unlink(input_path) except Exception: pass if not markdown_content.strip(): return None, "解析内容为空" return markdown_content, None except Exception as e: return None, f"MarkItDown 解析失败: {str(e)}"