创建 lyxy-reader-html skill
- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容 - 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退) - 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退) - 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索 - 新增 spec: html-document-parsing - 归档 change: create-lyxy-reader-html-skill
This commit is contained in:
140
skills/lyxy-reader-html/scripts/html_parser.py
Normal file
140
skills/lyxy-reader-html/scripts/html_parser.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HTML 解析模块,按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
|
||||
def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using trafilatura.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` is the extracted
        text and ``error`` is ``None``; on failure ``markdown`` is ``None`` and
        ``error`` describes why (library missing, nothing extracted, blank
        output, or an exception raised during extraction).
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    # Extraction settings: Markdown output keeping formatting, links, tables
    # and comments, but dropping images.
    extract_options = {
        "output_format": "markdown",
        "include_formatting": True,
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "favor_recall": True,
        "include_comments": True,
    }

    try:
        extracted = trafilatura.extract(html_content, **extract_options)
        if extracted is None:
            failure = "trafilatura 返回 None"
        elif extracted.strip():
            return extracted, None
        else:
            failure = "解析内容为空"
    except Exception as exc:
        failure = f"trafilatura 解析失败: {exc!s}"
    return None, failure
|
||||
|
||||
|
||||
def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using domscribe.

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; on
        failure ``markdown`` is ``None`` and ``error`` states whether the
        library is missing, the output was blank, or conversion raised.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        rendered = html_to_markdown(html_content, {'extract_main_content': True})
        if rendered.strip():
            return rendered, None
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"domscribe 解析失败: {exc!s}"
|
||||
|
||||
|
||||
def parse_with_markitdown(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using MarkItDown.

    MarkItDown is given a file path. If ``temp_file_path`` points at an
    existing file, that file is converted and left untouched. Otherwise
    ``html_content`` is written to a fresh temporary ``.html`` file, which is
    always removed again before returning, including when conversion fails.

    Args:
        html_content: Raw HTML text; used only when no usable file exists.
        temp_file_path: Optional path to an HTML file already on disk.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; on
        failure ``markdown`` is ``None`` and ``error`` states whether the
        library is missing, the output was blank, or conversion raised.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    import os
    import tempfile

    input_path = temp_file_path
    # Tracks whether this call created the file, so cleanup never depends on
    # what the caller passed in and never deletes a caller-owned file.
    created_temp = False
    try:
        if not input_path or not os.path.exists(input_path):
            fd, input_path = tempfile.mkstemp(suffix='.html')
            created_temp = True
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        result = MarkItDown().convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content

        # A missing (None) text_content is treated the same as blank output.
        if not markdown_content or not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        if created_temp:
            try:
                os.unlink(input_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # finished conversion into a failure.
                pass
|
||||
|
||||
|
||||
def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown using html2text (the last-resort parser).

    Args:
        html_content: Raw HTML text to convert.

    Returns:
        A ``(markdown, error)`` pair. On success ``error`` is ``None``; on
        failure ``markdown`` is ``None`` and ``error`` states whether the
        library is missing, the output was blank, or conversion raised.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    # Converter settings, applied in this order: keep emphasis and links,
    # drop images, disable line wrapping, skip internal links.
    settings = (
        ("ignore_emphasis", False),
        ("ignore_links", False),
        ("ignore_images", True),
        ("body_width", 0),
        ("skip_internal_links", True),
    )

    try:
        converter = html2text.HTML2Text()
        for attribute, value in settings:
            setattr(converter, attribute, value)
        rendered = converter.handle(html_content)
        if rendered.strip():
            return rendered, None
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"html2text 解析失败: {exc!s}"
|
||||
|
||||
|
||||
def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
    """Parse HTML into Markdown, trying each parser in priority order.

    Parsers are tried in the order trafilatura, domscribe, MarkItDown,
    html2text. The first one that yields content wins and the rest are not
    run.

    Args:
        html_content: Raw HTML text to convert.
        temp_file_path: Optional path to an HTML file on disk; it is passed
            only to the MarkItDown parser.

    Returns:
        A ``(content, failures)`` pair. ``content`` is the Markdown text, or
        ``None`` if every parser failed. ``failures`` lists one
        ``"- <parser>: <reason>"`` entry for each parser that was tried and
        failed before the result was produced.
    """
    parsers = (
        ("trafilatura", parse_with_trafilatura),
        ("domscribe", parse_with_domscribe),
        # Only MarkItDown takes the extra file path, so it needs a wrapper.
        ("MarkItDown", lambda html: parse_with_markitdown(html, temp_file_path)),
        ("html2text", parse_with_html2text),
    )

    failures: List[str] = []
    for name, parser in parsers:
        content, error = parser(html_content)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
||||
Reference in New Issue
Block a user