1
0
Files
Skill/skills/lyxy-reader-html/scripts/html_parser.py
lanyuanxiaoyao 6b4fcf2647 创建 lyxy-reader-html skill
- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退)
- 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退)
- 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
2026-03-08 02:02:03 +08:00

141 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""HTML 解析模块,按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""
from typing import List, Optional, Tuple
def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using trafilatura.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the extracted
        text and ``error`` is ``None``. On failure ``markdown`` is ``None`` and
        ``error`` is a message saying why: library missing, extractor returned
        ``None``, result was only whitespace, or an exception was raised.
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    # Extraction settings, forwarded verbatim to trafilatura.extract().
    extract_options = {
        "output_format": "markdown",
        "include_formatting": True,
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "favor_recall": True,
        "include_comments": True,
    }

    try:
        extracted = trafilatura.extract(html_content, **extract_options)
        if extracted is None:
            return None, "trafilatura 返回 None"
        if extracted.strip():
            return extracted, None
        return None, "解析内容为空"
    except Exception as exc:
        # Any extractor failure is reported to the caller instead of raised.
        return None, f"trafilatura 解析失败: {exc}"
def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using domscribe.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the converted
        text and ``error`` is ``None``. On failure ``markdown`` is ``None`` and
        ``error`` is a message saying why: library missing, result was only
        whitespace, or an exception was raised.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        # The options dict is passed positionally as the second argument.
        converted = html_to_markdown(html_content, {'extract_main_content': True})
        if converted.strip():
            return converted, None
        return None, "解析内容为空"
    except Exception as exc:
        # Any converter failure is reported to the caller instead of raised.
        return None, f"domscribe 解析失败: {exc}"
def parse_with_markitdown(
    html_content: str,
    temp_file_path: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is given a file path. If ``temp_file_path`` names an existing
    file, that file is converted and left in place. Otherwise ``html_content``
    is written to a scratch ``.html`` file, which is always removed before
    returning, including when conversion fails.

    Args:
        html_content: Raw HTML text; only used when a scratch file is needed.
        temp_file_path: Optional path of an HTML file already on disk.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the converted
        text and ``error`` is ``None``. On failure ``markdown`` is ``None`` and
        ``error`` is a message saying why: library missing, result empty, or
        an exception was raised.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    import os
    import tempfile

    input_path = temp_file_path
    # Cleanup is keyed on whether we created the file, not on whether the
    # caller passed a path: a caller-supplied path that does not exist still
    # causes a scratch file to be created here.
    created_scratch_file = False
    try:
        if not input_path or not os.path.exists(input_path):
            fd, input_path = tempfile.mkstemp(suffix='.html')
            created_scratch_file = True
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        result = MarkItDown().convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content
        if not markdown_content or not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        # Any conversion failure is reported to the caller instead of raised.
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        if created_scratch_file:
            try:
                os.unlink(input_path)
            except OSError:
                # Best-effort cleanup: a leftover scratch file must not mask
                # the conversion result.
                pass
def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using html2text (the last-resort parser).

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the converted
        text and ``error`` is ``None``. On failure ``markdown`` is ``None`` and
        ``error`` is a message saying why: library missing, result was only
        whitespace, or an exception was raised.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    # Converter attributes, applied in this order before conversion.
    converter_settings = (
        ("ignore_emphasis", False),
        ("ignore_links", False),
        ("ignore_images", True),
        ("body_width", 0),
        ("skip_internal_links", True),
    )

    try:
        converter = html2text.HTML2Text()
        for attribute, value in converter_settings:
            setattr(converter, attribute, value)
        rendered = converter.handle(html_content)
        if rendered.strip():
            return rendered, None
        return None, "解析内容为空"
    except Exception as exc:
        # Any converter failure is reported to the caller instead of raised.
        return None, f"html2text 解析失败: {exc}"
def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
    """Convert HTML to Markdown, trying each parser in priority order.

    Parsers are tried in the order trafilatura, domscribe, MarkItDown,
    html2text. The first one that returns content wins, and later parsers are
    not called.

    Args:
        html_content: Raw HTML text to convert.
        temp_file_path: Optional path of an HTML file on disk, forwarded to
            the MarkItDown parser only.

    Returns:
        A ``(content, failures)`` pair. ``content`` is the Markdown text from
        the first parser that succeeded, or ``None`` if all of them failed.
        ``failures`` holds one ``"- <parser>: <reason>"`` line for each parser
        tried before the successful one (all of them when ``content`` is
        ``None``).
    """
    failures: List[str] = []

    # Only MarkItDown takes the extra path argument, so it alone needs a
    # wrapper; the others are called directly.
    parsers = [
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        ("MarkItDown", lambda c: parse_with_markitdown(c, temp_file_path)),
        ("html2text", parse_with_html2text),
    ]

    for name, func in parsers:
        content, error = func(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures