#!/usr/bin/env python3
"""HTML parsing module.

Converts HTML to Markdown by trying several backends in priority order:
trafilatura -> domscribe -> MarkItDown -> html2text.

Every ``parse_with_*`` function returns a ``(content, error)`` pair: exactly
one of the two is ``None``. All backends are optional third-party packages
and are imported lazily, so a missing package is reported as an error string
instead of breaking the import of this module.
"""

import contextlib
import os
import tempfile
from typing import List, Optional, Tuple

# Error message shared by every backend when the conversion produced no text.
_EMPTY_CONTENT_ERROR = "解析内容为空"


def _as_result(markdown_content: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Wrap converter output as ``(content, None)`` or ``(None, error)``.

    ``None``, empty, and whitespace-only output are all reported as empty
    content.
    """
    if not markdown_content or not markdown_content.strip():
        return None, _EMPTY_CONTENT_ERROR
    return markdown_content, None


def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with trafilatura.

    Args:
        html_content: Raw HTML text.

    Returns:
        ``(markdown, None)`` on success, ``(None, reason)`` otherwise.
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    try:
        markdown_content = trafilatura.extract(
            html_content,
            output_format="markdown",
            include_formatting=True,
            include_links=True,
            include_images=False,
            include_tables=True,
            favor_recall=True,
            include_comments=True,
        )
        # Keep "extract returned None" distinguishable from "returned blank text".
        if markdown_content is None:
            return None, "trafilatura 返回 None"
        return _as_result(markdown_content)
    except Exception as e:
        return None, f"trafilatura 解析失败: {e}"


def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with domscribe.

    Args:
        html_content: Raw HTML text.

    Returns:
        ``(markdown, None)`` on success, ``(None, reason)`` otherwise.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        options = {
            'extract_main_content': True,
        }
        return _as_result(html_to_markdown(html_content, options))
    except Exception as e:
        return None, f"domscribe 解析失败: {e}"


def parse_with_markitdown(
    html_content: str, temp_file_path: Optional[str] = None
) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with MarkItDown.

    MarkItDown is given a file path. If ``temp_file_path`` points to an
    existing file it is used as-is and left untouched; otherwise
    ``html_content`` is written to a temporary ``.html`` file that is always
    removed before returning, including when the conversion fails.

    Args:
        html_content: Raw HTML text.
        temp_file_path: Optional path of a file already holding the HTML.

    Returns:
        ``(markdown, None)`` on success, ``(None, reason)`` otherwise.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    # Path of a temp file created here (and therefore ours to delete).
    created_path = None
    try:
        input_path = temp_file_path
        if not input_path or not os.path.exists(input_path):
            fd, created_path = tempfile.mkstemp(suffix=".html")
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                f.write(html_content)
            input_path = created_path

        result = MarkItDown().convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        return _as_result(result.text_content)
    except Exception as e:
        return None, f"MarkItDown 解析失败: {e}"
    finally:
        if created_path is not None:
            # Best-effort cleanup: a failed delete must not mask the result.
            with contextlib.suppress(OSError):
                os.unlink(created_path)


def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with html2text (last-resort fallback).

    Args:
        html_content: Raw HTML text.

    Returns:
        ``(markdown, None)`` on success, ``(None, reason)`` otherwise.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    try:
        converter = html2text.HTML2Text()
        converter.ignore_emphasis = False
        converter.ignore_links = False
        converter.ignore_images = True
        converter.body_width = 0
        converter.skip_internal_links = True
        return _as_result(converter.handle(html_content))
    except Exception as e:
        return None, f"html2text 解析失败: {e}"


def parse_html(
    html_content: str, temp_file_path: Optional[str] = None
) -> Tuple[Optional[str], List[str]]:
    """Convert HTML to Markdown, trying each backend in priority order.

    Args:
        html_content: Raw HTML text.
        temp_file_path: Optional path of a file already holding the HTML;
            only forwarded to the MarkItDown backend.

    Returns:
        ``(content, failures)`` where ``content`` is the Markdown produced by
        the first backend that succeeds (``None`` if all fail) and
        ``failures`` lists the failure reason of every backend tried before
        that point, formatted as ``"- <name>: <reason>"``.
    """
    parsers = [
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        ("MarkItDown", lambda c: parse_with_markitdown(c, temp_file_path)),
        ("html2text", parse_with_html2text),
    ]

    failures: List[str] = []
    for name, func in parsers:
        content, error = func(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures