#!/usr/bin/env python3
"""HTML 解析模块,按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""
from typing import List, Optional, Tuple
def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using trafilatura.

    Args:
        html_content: The HTML text to convert.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the Markdown
        text and ``error`` is ``None``. On failure ``content`` is ``None`` and
        ``error`` says why: the library is not installed, the extractor
        returned ``None``, the output was blank, or extraction raised.
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    # Extraction settings collected in one place; passed through unchanged.
    extract_options = dict(
        output_format="markdown",
        include_formatting=True,
        include_links=True,
        include_images=False,
        include_tables=True,
        favor_recall=True,
        include_comments=True,
    )

    try:
        extracted = trafilatura.extract(html_content, **extract_options)
        if extracted is None:
            failure = "trafilatura 返回 None"
        elif extracted.strip():
            return extracted, None
        else:
            # Whitespace-only output counts as a failed parse.
            failure = "解析内容为空"
    except Exception as exc:
        # Any extractor error is reported to the caller instead of raised.
        failure = f"trafilatura 解析失败: {exc!s}"
    return None, failure
def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using domscribe.

    Args:
        html_content: The HTML text to convert.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the Markdown
        text and ``error`` is ``None``. On failure ``content`` is ``None`` and
        ``error`` says why: the library is not installed, the output was
        blank, or the conversion raised.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        rendered = html_to_markdown(html_content, {'extract_main_content': True})
        if rendered.strip():
            return rendered, None
        # Whitespace-only output counts as a failed parse.
        return None, "解析内容为空"
    except Exception as exc:
        # Any converter error is reported to the caller instead of raised.
        return None, f"domscribe 解析失败: {exc!s}"
def parse_with_markitdown(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is given a file path. If ``temp_file_path`` names an existing
    file, that file is converted as-is and left in place. Otherwise
    ``html_content`` is written to a fresh temporary ``.html`` file, which is
    always removed before returning, even when the conversion fails.

    Args:
        html_content: The HTML text to convert. Only written to disk when no
            usable ``temp_file_path`` is supplied.
        temp_file_path: Optional path to an existing HTML file to convert.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the Markdown
        text and ``error`` is ``None``. On failure ``content`` is ``None`` and
        ``error`` says why: the library is not installed, the output was
        blank, or the conversion raised.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    import os
    import tempfile

    # Path of the temp file created here, if any. Only this file is deleted;
    # a caller-supplied file is never removed.
    created_path = None
    try:
        input_path = temp_file_path
        if not input_path or not os.path.exists(input_path):
            # No usable file from the caller: write the HTML to a temp file.
            fd, created_path = tempfile.mkstemp(suffix='.html')
            input_path = created_path
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        md = MarkItDown()
        result = md.convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content
        if not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        # Any conversion error is reported to the caller instead of raised.
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        # Runs on success, blank output and exceptions alike, so the temp
        # file cannot leak. Removal is best-effort.
        if created_path is not None:
            try:
                os.unlink(created_path)
            except OSError:
                pass
def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using html2text (the fallback parser).

    Args:
        html_content: The HTML text to convert.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the Markdown
        text and ``error`` is ``None``. On failure ``content`` is ``None`` and
        ``error`` says why: the library is not installed, the output was
        blank, or the conversion raised.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    # Converter attributes, applied in this order before handling the input.
    settings = (
        ("ignore_emphasis", False),
        ("ignore_links", False),
        ("ignore_images", True),
        ("body_width", 0),
        ("skip_internal_links", True),
    )

    try:
        converter = html2text.HTML2Text()
        for attribute, value in settings:
            setattr(converter, attribute, value)
        rendered = converter.handle(html_content)
        if rendered.strip():
            return rendered, None
        # Whitespace-only output counts as a failed parse.
        return None, "解析内容为空"
    except Exception as exc:
        # Any converter error is reported to the caller instead of raised.
        return None, f"html2text 解析失败: {exc!s}"
def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
    """Convert HTML to Markdown, trying each parser in priority order.

    The parsers are tried in the order trafilatura, domscribe, MarkItDown,
    html2text. The first one that returns content wins and the rest are not
    run.

    Args:
        html_content: The HTML text to convert.
        temp_file_path: Optional path forwarded to ``parse_with_markitdown``
            only; the other parsers work from ``html_content`` alone.

    Returns:
        A ``(content, failures)`` pair. ``content`` is the Markdown text from
        the first successful parser, or ``None`` if every parser failed.
        ``failures`` holds one ``"- <parser>: <reason>"`` entry for each
        parser that was tried and failed, so it may be non-empty on success.
    """
    # (display name, callable) in priority order. Only MarkItDown needs a
    # wrapper, to carry the extra temp_file_path argument.
    parsers = (
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        ("MarkItDown", lambda html: parse_with_markitdown(html, temp_file_path)),
        ("html2text", parse_with_html2text),
    )

    failures: List[str] = []
    for name, parser in parsers:
        content, error = parser(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")
    return None, failures