创建 lyxy-reader-html skill

- 新增 skill: lyxy-reader-html，用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载（pyppeteer → selenium → httpx → urllib 优先级回退）
- 支持 HTML 解析（trafilatura → domscribe → MarkItDown → html2text 优先级回退）
- 支持查询功能：全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
2026-03-08 02:02:03 +08:00
parent 0bd9ec8a36
commit 6b4fcf2647
16 changed files with 1827 additions and 3 deletions
--- a/skills/lyxy-reader-html/scripts/html_parser.py
+++ b/skills/lyxy-reader-html/scripts/html_parser.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+"""HTML 解析模块，按 trafilatura → domscribe → MarkItDown → html2text 优先级尝试解析。"""
+
+from typing import List, Optional, Tuple
+
+
+def parse_with_trafilatura(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """使用 trafilatura 解析 HTML。"""
+    try:
+        import trafilatura
+    except ImportError:
+        return None, "trafilatura 库未安装"
+
+    try:
+        markdown_content = trafilatura.extract(
+            html_content,
+            output_format="markdown",
+            include_formatting=True,
+            include_links=True,
+            include_images=False,
+            include_tables=True,
+            favor_recall=True,
+            include_comments=True,
+        )
+        if markdown_content is None:
+            return None, "trafilatura 返回 None"
+        if not markdown_content.strip():
+            return None, "解析内容为空"
+        return markdown_content, None
+    except Exception as e:
+        return None, f"trafilatura 解析失败: {str(e)}"
+
+
+def parse_with_domscribe(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """使用 domscribe 解析 HTML。"""
+    try:
+        from domscribe import html_to_markdown
+    except ImportError:
+        return None, "domscribe 库未安装"
+
+    try:
+        options = {
+            'extract_main_content': True,
+        }
+        markdown_content = html_to_markdown(html_content, options)
+        if not markdown_content.strip():
+            return None, "解析内容为空"
+        return markdown_content, None
+    except Exception as e:
+        return None, f"domscribe 解析失败: {str(e)}"
+
+
+def parse_with_markitdown(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
+    """使用 MarkItDown 解析 HTML。"""
+    try:
+        from markitdown import MarkItDown
+    except ImportError:
+        return None, "MarkItDown 库未安装"
+
+    try:
+        import tempfile
+        import os
+
+        input_path = temp_file_path
+        if not input_path or not os.path.exists(input_path):
+            # 创建临时文件
+            fd, input_path = tempfile.mkstemp(suffix='.html')
+            with os.fdopen(fd, 'w', encoding='utf-8') as f:
+                f.write(html_content)
+
+        md = MarkItDown()
+        result = md.convert(
+            input_path,
+            heading_style="ATX",
+            strip=["img", "script", "style", "noscript"],
+        )
+        markdown_content = result.text_content
+
+        if not temp_file_path:
+            try:
+                os.unlink(input_path)
+            except Exception:
+                pass
+
+        if not markdown_content.strip():
+            return None, "解析内容为空"
+        return markdown_content, None
+    except Exception as e:
+        return None, f"MarkItDown 解析失败: {str(e)}"
+
+
+def parse_with_html2text(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """使用 html2text 解析 HTML（兜底方案）。"""
+    try:
+        import html2text
+    except ImportError:
+        return None, "html2text 库未安装"
+
+    try:
+        converter = html2text.HTML2Text()
+        converter.ignore_emphasis = False
+        converter.ignore_links = False
+        converter.ignore_images = True
+        converter.body_width = 0
+        converter.skip_internal_links = True
+        markdown_content = converter.handle(html_content)
+        if not markdown_content.strip():
+            return None, "解析内容为空"
+        return markdown_content, None
+    except Exception as e:
+        return None, f"html2text 解析失败: {str(e)}"
+
+
+def parse_html(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], List[str]]:
+    """
+    统一的 HTML 解析入口函数，按优先级尝试各解析器。
+
+    返回: (content, failures)
+    - content: 成功时返回 Markdown 内容，失败时返回 None
+    - failures: 各解析器的失败原因列表
+    """
+    failures = []
+    content = None
+
+    # 按优先级尝试各解析器
+    parsers = [
+        ("trafilatura", lambda c: parse_with_trafilatura(c)),
+        ("domscribe", lambda c: parse_with_domscribe(c)),
+        ("MarkItDown", lambda c: parse_with_markitdown(c, temp_file_path)),
+        ("html2text", lambda c: parse_with_html2text(c)),
+    ]
+
+    for name, func in parsers:
+        content, error = func(html_content)
+        if content is not None:
+            return content, failures
+        else:
+            failures.append(f"- {name}: {error}")
+
+    return None, failures