创建 lyxy-reader-html skill
- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退)
- 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退)
- 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
This commit is contained in:
128
skills/lyxy-reader-html/scripts/parser.py
Normal file
128
skills/lyxy-reader-html/scripts/parser.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
"""Command-line interface for the HTML parser. Supports URLs and HTML files."""

import argparse
import logging
import os
import sys
import warnings

# Suppress third-party progress bars and logging so that only the parse
# result is emitted on stdout.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TQDM_DISABLE"] = "1"
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)

# Project-local modules. NOTE(review): imported after the suppression
# settings above — presumably so their transitive imports (HF hub, tqdm,
# etc.) pick up the environment variables; keep this ordering.
import common
import downloader
import html_parser
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="将 URL 或 HTML 文件解析为 Markdown"
|
||||
)
|
||||
|
||||
parser.add_argument("input", help="URL 或 HTML 文件的路径")
|
||||
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--context",
|
||||
type=int,
|
||||
default=2,
|
||||
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
|
||||
)
|
||||
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
|
||||
)
|
||||
group.add_argument(
|
||||
"-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
|
||||
)
|
||||
group.add_argument(
|
||||
"-t",
|
||||
"--titles",
|
||||
action="store_true",
|
||||
help="返回解析后的 markdown 文档的标题行(1-6级)",
|
||||
)
|
||||
group.add_argument(
|
||||
"-tc",
|
||||
"--title-content",
|
||||
help="指定标题名称,输出该标题及其下级内容(不包含#号)",
|
||||
)
|
||||
group.add_argument(
|
||||
"-s",
|
||||
"--search",
|
||||
help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# 判断输入类型
|
||||
html_content = None
|
||||
temp_file_path = None
|
||||
|
||||
if common.is_url(args.input):
|
||||
# URL 模式
|
||||
html_content, download_failures = downloader.download_html(args.input)
|
||||
if html_content is None:
|
||||
print("所有下载方法均失败:")
|
||||
for failure in download_failures:
|
||||
print(failure)
|
||||
sys.exit(1)
|
||||
else:
|
||||
# HTML 文件模式
|
||||
if not os.path.exists(args.input):
|
||||
print(f"错误: 文件不存在: {args.input}")
|
||||
sys.exit(1)
|
||||
if not common.is_html_file(args.input):
|
||||
print(f"错误: 不是有效的 HTML 文件: {args.input}")
|
||||
sys.exit(1)
|
||||
with open(args.input, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
temp_file_path = args.input
|
||||
|
||||
# HTML 预处理清理
|
||||
cleaned_html = common.clean_html_content(html_content)
|
||||
|
||||
# 解析 HTML
|
||||
markdown_content, parse_failures = html_parser.parse_html(cleaned_html, temp_file_path)
|
||||
if markdown_content is None:
|
||||
print("所有解析方法均失败:")
|
||||
for failure in parse_failures:
|
||||
print(failure)
|
||||
sys.exit(1)
|
||||
|
||||
# Markdown 后处理
|
||||
markdown_content = common.remove_markdown_images(markdown_content)
|
||||
markdown_content = common.normalize_markdown_whitespace(markdown_content)
|
||||
|
||||
# 根据参数输出
|
||||
if args.count:
|
||||
print(len(markdown_content.replace("\n", "")))
|
||||
elif args.lines:
|
||||
print(len(markdown_content.split("\n")))
|
||||
elif args.titles:
|
||||
titles = common.extract_titles(markdown_content)
|
||||
for title in titles:
|
||||
print(title)
|
||||
elif args.title_content:
|
||||
title_content = common.extract_title_content(markdown_content, args.title_content)
|
||||
if title_content is None:
|
||||
print(f"错误: 未找到标题 '{args.title_content}'")
|
||||
sys.exit(1)
|
||||
print(title_content, end="")
|
||||
elif args.search:
|
||||
search_result = common.search_markdown(markdown_content, args.search, args.context)
|
||||
if search_result is None:
|
||||
print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
|
||||
sys.exit(1)
|
||||
print(search_result, end="")
|
||||
else:
|
||||
print(markdown_content, end="")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user