- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容 - 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退) - 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退) - 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索 - 新增 spec: html-document-parsing - 归档 change: create-lyxy-reader-html-skill
141 lines
4.5 KiB
Python
141 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
||
"""HTML 解析模块,按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""
|
||
|
||
from typing import List, Optional, Tuple
|
||
|
||
|
||
def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using trafilatura.

    Returns a ``(markdown, error)`` pair: on success ``error`` is ``None``;
    on failure ``markdown`` is ``None`` and ``error`` describes the reason
    (library missing, extractor returned ``None``, blank output, or an
    exception raised by the extractor).
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    # Keyword options handed to trafilatura.extract(); images are the only
    # content category switched off here.
    extract_options = {
        "output_format": "markdown",
        "include_formatting": True,
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "favor_recall": True,
        "include_comments": True,
    }

    try:
        extracted = trafilatura.extract(html_content, **extract_options)
        if extracted is None:
            return None, "trafilatura 返回 None"
        if extracted.strip():
            return extracted, None
        # Whitespace-only output is reported as a failure.
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"trafilatura 解析失败: {exc!s}"
|
||
|
||
|
||
def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using domscribe.

    Returns a ``(markdown, error)`` pair: on success ``error`` is ``None``;
    on failure ``markdown`` is ``None`` and ``error`` describes the reason
    (library missing, blank output, or an exception raised during
    conversion).
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        # The options dict is passed positionally as the second argument.
        rendered = html_to_markdown(html_content, {'extract_main_content': True})
        if rendered.strip():
            return rendered, None
        # Whitespace-only output is reported as a failure.
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"domscribe 解析失败: {exc!s}"
|
||
|
||
|
||
def parse_with_markitdown(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is handed a file path. ``temp_file_path`` is used when it
    names an existing file; otherwise ``html_content`` is written to a
    scratch ``.html`` file that this function creates and always removes
    before returning. A caller-supplied existing file is never deleted.

    Args:
        html_content: HTML source; only written to disk when a scratch
            file has to be created.
        temp_file_path: Optional path of an existing HTML file to convert.

    Returns:
        ``(markdown, None)`` on success, or ``(None, reason)`` when the
        library is missing, the output is empty, or conversion raised.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    import os
    import tempfile

    input_path = temp_file_path
    # True only when this call created the file, so cleanup never touches
    # a caller-owned path and never skips a file we created ourselves.
    created_temp = False

    try:
        if not input_path or not os.path.exists(input_path):
            fd, input_path = tempfile.mkstemp(suffix='.html')
            created_temp = True
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        md = MarkItDown()
        # NOTE(review): heading_style / strip are extra keyword arguments to
        # convert(); presumably forwarded to the underlying HTML-to-Markdown
        # converter — confirm against the MarkItDown documentation.
        result = md.convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content

        # Treat a missing (None) result the same as blank output.
        if not markdown_content or not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        # Runs on success, empty output and conversion errors alike.
        if created_temp:
            try:
                os.unlink(input_path)
            except OSError:
                # Best-effort cleanup; a failed delete must not mask the result.
                pass
|
||
|
||
|
||
def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using html2text (last-resort parser).

    Returns a ``(markdown, error)`` pair: on success ``error`` is ``None``;
    on failure ``markdown`` is ``None`` and ``error`` describes the reason
    (library missing, blank output, or an exception raised during
    conversion).
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    # Attribute values applied to the HTML2Text instance, in this order.
    # NOTE(review): body_width = 0 presumably disables line wrapping —
    # confirm against the html2text documentation.
    settings = (
        ("ignore_emphasis", False),
        ("ignore_links", False),
        ("ignore_images", True),
        ("body_width", 0),
        ("skip_internal_links", True),
    )

    try:
        renderer = html2text.HTML2Text()
        for option, value in settings:
            setattr(renderer, option, value)
        rendered = renderer.handle(html_content)
        if rendered.strip():
            return rendered, None
        # Whitespace-only output is reported as a failure.
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"html2text 解析失败: {exc!s}"
|
||
|
||
|
||
def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
    """Parse HTML to Markdown, trying each parser in priority order.

    The order is trafilatura, domscribe, MarkItDown, html2text. The first
    parser that returns non-``None`` content wins and later parsers are not
    tried.

    Args:
        html_content: HTML source to convert.
        temp_file_path: Optional path passed through to the MarkItDown
            parser only.

    Returns:
        ``(content, failures)``. ``content`` is the Markdown text, or
        ``None`` when every parser failed. ``failures`` lists one
        ``"- <parser>: <reason>"`` entry per parser that failed before the
        result was produced.
    """
    failures: List[str] = []

    # (display name, callable taking the HTML string) in priority order.
    parsers = [
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        # Only MarkItDown takes the optional on-disk copy of the HTML.
        ("MarkItDown", lambda c: parse_with_markitdown(c, temp_file_path)),
        ("html2text", parse_with_html2text),
    ]

    for name, func in parsers:
        content, error = func(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|