1
0
Files
Skill/skills/lyxy-reader-html/scripts/parser.py
lanyuanxiaoyao 6b4fcf2647 创建 lyxy-reader-html skill
- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退)
- 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退)
- 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
2026-03-08 02:02:03 +08:00

129 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Command-line front end for the HTML parser. Accepts a URL or an HTML file."""
import argparse
import logging
import os
import sys
import warnings
# Silence third-party progress bars and log noise so that only the parsed
# result reaches stdout; must run before the parser modules are imported.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TQDM_DISABLE"] = "1"
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
import common
import downloader
import html_parser
def _parse_args() -> argparse.Namespace:
    """Build the CLI and parse ``sys.argv``.

    The query options (-c/-l/-t/-tc/-s) are mutually exclusive; with none of
    them given, the full Markdown document is printed.
    """
    parser = argparse.ArgumentParser(
        description="将 URL 或 HTML 文件解析为 Markdown"
    )
    parser.add_argument("input", help="URL 或 HTML 文件的路径")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行1-6级",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    return parser.parse_args()


def _load_html(source: str) -> "tuple[str, str | None]":
    """Return ``(html_content, local_path_or_None)`` for a URL or file path.

    URL mode returns ``None`` for the path; file mode returns the path so the
    parser can use it as a hint. Exits with status 1 on download failure,
    missing file, or a file that does not look like HTML.
    """
    if common.is_url(source):
        # URL mode: downloader tries its back-end chain and reports each failure.
        html_content, download_failures = downloader.download_html(source)
        if html_content is None:
            print("所有下载方法均失败:")
            for failure in download_failures:
                print(failure)
            sys.exit(1)
        return html_content, None
    # Local-file mode: validate before reading.
    if not os.path.exists(source):
        print(f"错误: 文件不存在: {source}")
        sys.exit(1)
    if not common.is_html_file(source):
        print(f"错误: 不是有效的 HTML 文件: {source}")
        sys.exit(1)
    with open(source, "r", encoding="utf-8") as f:
        return f.read(), source


def main() -> None:
    """Entry point: convert the input to Markdown, then answer the query flags.

    Exits with status 1 when download, parsing, or a title/search lookup fails.
    """
    args = _parse_args()
    html_content, temp_file_path = _load_html(args.input)
    # Pre-clean the HTML, then run the parser back-end chain.
    cleaned_html = common.clean_html_content(html_content)
    markdown_content, parse_failures = html_parser.parse_html(cleaned_html, temp_file_path)
    if markdown_content is None:
        print("所有解析方法均失败:")
        for failure in parse_failures:
            print(failure)
        sys.exit(1)
    # Markdown post-processing: drop images, normalize blank lines/whitespace.
    markdown_content = common.remove_markdown_images(markdown_content)
    markdown_content = common.normalize_markdown_whitespace(markdown_content)
    # Dispatch on the (mutually exclusive) query flags.
    if args.count:
        # Character count excluding newlines.
        print(len(markdown_content.replace("\n", "")))
    elif args.lines:
        print(len(markdown_content.split("\n")))
    elif args.titles:
        for title in common.extract_titles(markdown_content):
            print(title)
    elif args.title_content is not None:
        # `is not None` (not truthiness): an explicitly empty title must reach
        # the lookup's error path instead of silently printing the whole doc.
        title_content = common.extract_title_content(markdown_content, args.title_content)
        if title_content is None:
            print(f"错误: 未找到标题 '{args.title_content}'")
            sys.exit(1)
        print(title_content, end="")
    elif args.search is not None:
        # Same `is not None` guard: an empty pattern is still a valid regex.
        search_result = common.search_markdown(markdown_content, args.search, args.context)
        if search_result is None:
            print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
            sys.exit(1)
        print(search_result, end="")
    else:
        # No query flag: emit the full Markdown document.
        print(markdown_content, end="")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()