创建 lyxy-reader-html skill

- 新增 skill: lyxy-reader-html，用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载（pyppeteer → selenium → httpx → urllib 优先级回退）
- 支持 HTML 解析（trafilatura → domscribe → MarkItDown → html2text 优先级回退）
- 支持查询功能：全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
2026-03-08 02:02:03 +08:00
parent 0bd9ec8a36
commit 6b4fcf2647
16 changed files with 1827 additions and 3 deletions
--- a/skills/lyxy-reader-html/scripts/parser.py
+++ b/skills/lyxy-reader-html/scripts/parser.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""Command-line front end of the HTML parser; accepts a URL or an HTML file."""
+
+import argparse
+import logging
+import os
+import sys
+import warnings
+
+# Silence progress bars and logging from third-party libraries so that only the
+# parse result is written to the output. These lines sit ahead of the project
+# imports below, presumably so the settings are already in effect when those
+# modules (and whatever they import) are loaded -- keep this order.
+os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
+os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
+os.environ["TQDM_DISABLE"] = "1"
+warnings.filterwarnings("ignore")  # ignore every warning category
+logging.disable(logging.WARNING)  # drop log records at WARNING level and below
+
+import common
+import downloader
+import html_parser
+
+
+def main() -> None:
+    """Command-line entry point: convert a URL or a local HTML file to Markdown.
+
+    The positional ``input`` is fetched with ``downloader.download_html`` when
+    ``common.is_url`` reports a URL; otherwise it is read from disk as a UTF-8
+    HTML file. The HTML is cleaned, converted by ``html_parser.parse_html`` and
+    post-processed, then one result is printed to stdout depending on which of
+    the mutually exclusive options was given:
+
+    * ``-c`` / ``--count``: number of characters, newlines excluded
+    * ``-l`` / ``--lines``: number of newline-separated lines
+    * ``-t`` / ``--titles``: the titles returned by ``common.extract_titles``
+    * ``-tc`` / ``--title-content``: result of ``common.extract_title_content``
+    * ``-s`` / ``--search``: result of ``common.search_markdown``; ``-n`` /
+      ``--context`` is passed along as the context size
+    * no option: the complete Markdown text
+
+    Handled failures (download failed, file missing / not HTML / unreadable,
+    parsing failed, title not found, no search result) are reported on stdout
+    and terminate the process through ``sys.exit(1)``.
+    """
+    parser = argparse.ArgumentParser(
+        description="将 URL 或 HTML 文件解析为 Markdown"
+    )
+
+    parser.add_argument("input", help="URL 或 HTML 文件的路径")
+
+    parser.add_argument(
+        "-n",
+        "--context",
+        type=int,
+        default=2,
+        help="与 -s 配合使用，指定每个检索结果包含的前后行数（不包含空行）",
+    )
+
+    # At most one query mode may be selected per invocation.
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
+    )
+    group.add_argument(
+        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
+    )
+    group.add_argument(
+        "-t",
+        "--titles",
+        action="store_true",
+        help="返回解析后的 markdown 文档的标题行（1-6级）",
+    )
+    group.add_argument(
+        "-tc",
+        "--title-content",
+        help="指定标题名称，输出该标题及其下级内容（不包含#号）",
+    )
+    group.add_argument(
+        "-s",
+        "--search",
+        help="使用正则表达式搜索文档，返回所有匹配结果（用---分隔）",
+    )
+
+    args = parser.parse_args()
+
+    # Decide how to obtain the HTML. temp_file_path stays None for URLs and is
+    # set to the input path for local files; it is forwarded to parse_html.
+    html_content = None
+    temp_file_path = None
+
+    if common.is_url(args.input):
+        # URL mode: download_html returns (content, failures); content is None
+        # when every download method failed.
+        html_content, download_failures = downloader.download_html(args.input)
+        if html_content is None:
+            print("所有下载方法均失败:")
+            for failure in download_failures:
+                print(failure)
+            sys.exit(1)
+    else:
+        # Local HTML file mode.
+        if not os.path.exists(args.input):
+            print(f"错误: 文件不存在: {args.input}")
+            sys.exit(1)
+        if not common.is_html_file(args.input):
+            print(f"错误: 不是有效的 HTML 文件: {args.input}")
+            sys.exit(1)
+        try:
+            with open(args.input, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        except (OSError, UnicodeDecodeError) as exc:
+            # An unreadable path (directory, permission denied, ...) or a file
+            # that is not valid UTF-8 is reported like the other input errors
+            # instead of aborting with a traceback.
+            print(f"错误: 无法读取文件: {args.input} ({exc})")
+            sys.exit(1)
+        temp_file_path = args.input
+
+    # Pre-clean the raw HTML before conversion.
+    cleaned_html = common.clean_html_content(html_content)
+
+    # Convert to Markdown; parse_html returns (markdown, failures) and the
+    # markdown is None when every parsing method failed.
+    markdown_content, parse_failures = html_parser.parse_html(cleaned_html, temp_file_path)
+    if markdown_content is None:
+        print("所有解析方法均失败:")
+        for failure in parse_failures:
+            print(failure)
+        sys.exit(1)
+
+    # Markdown post-processing: strip images, then normalize whitespace.
+    markdown_content = common.remove_markdown_images(markdown_content)
+    markdown_content = common.normalize_markdown_whitespace(markdown_content)
+
+    # Emit the result selected by the command-line options.
+    if args.count:
+        # Character count with newline characters left out.
+        print(len(markdown_content.replace("\n", "")))
+    elif args.lines:
+        print(len(markdown_content.split("\n")))
+    elif args.titles:
+        titles = common.extract_titles(markdown_content)
+        for title in titles:
+            print(title)
+    elif args.title_content:
+        title_content = common.extract_title_content(markdown_content, args.title_content)
+        if title_content is None:
+            print(f"错误: 未找到标题 '{args.title_content}'")
+            sys.exit(1)
+        # end="" so print does not append a newline to the returned text.
+        print(title_content, end="")
+    elif args.search:
+        search_result = common.search_markdown(markdown_content, args.search, args.context)
+        if search_result is None:
+            print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
+            sys.exit(1)
+        print(search_result, end="")
+    else:
+        print(markdown_content, end="")
+
+
+if __name__ == "__main__":
+    main()