1
0

创建 lyxy-reader-html skill

- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退)
- 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退)
- 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
This commit is contained in:
2026-03-08 02:02:03 +08:00
parent 0bd9ec8a36
commit 6b4fcf2647
16 changed files with 1827 additions and 3 deletions

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""HTML 解析器的公共模块,包含 HTML 清理、Markdown 处理等工具函数。"""
import re
from typing import List, Optional
from bs4 import BeautifulSoup
# Matches inline Markdown image syntax of the form ![alt](target).
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# Matches a run of three or more consecutive newline characters.
_CONSECUTIVE_BLANK_LINES = re.compile(r"\n{3,}")
def clean_html_content(html_content: str) -> str:
    """Strip non-content elements and URL-bearing attributes from HTML.

    The markup is parsed with BeautifulSoup's ``html.parser`` and then:

    * every ``script``, ``style``, ``svg`` and ``link`` element is removed;
    * the ``href``, ``src``, ``srcset``, ``action``, ``style`` and
      ``data-href`` attributes are deleted from every remaining tag, along
      with any ``data-*`` attribute whose name contains ``src``;
    * ``http(s)://...`` substrings are cut out of ``title`` attributes;
    * class tokens that start with ``"url "`` or contain
      ``"hyperlink-href:"`` are dropped from ``class`` attributes.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The cleaned document serialized back to a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove whole elements, one tag name at a time so each lookup runs
    # against the tree as it stands after the previous removals.
    for tag_name in ("script", "style", "svg", "link"):
        for node in soup.find_all(tag_name):
            node.decompose()

    # Scrub attributes on every tag that is left.
    for node in soup.find_all(True):
        for name in ("href", "src", "srcset", "action"):
            if name in node.attrs:
                del node[name]

        # Collect the names first so the attribute dict is not mutated
        # while it is being iterated.
        source_like = [
            name
            for name in node.attrs
            if name.startswith("data-") and "src" in name.lower()
        ]
        for name in source_like:
            del node[name]

        if "style" in node.attrs:
            del node["style"]

        if "data-href" in node.attrs:
            del node["data-href"]

        if "title" in node.attrs:
            node["title"] = re.sub(
                r"https?://\S+", "", node["title"], flags=re.IGNORECASE
            )

        if "class" in node.attrs:
            node["class"] = [
                token
                for token in node["class"]
                if not (token.startswith("url ") or "hyperlink-href:" in token)
            ]

    return str(soup)
def remove_markdown_images(markdown_text: str) -> str:
    """Return ``markdown_text`` with every ``IMAGE_PATTERN`` match deleted."""
    without_images = re.sub(IMAGE_PATTERN, "", markdown_text)
    return without_images
def normalize_markdown_whitespace(content: str) -> str:
    """Collapse each run of three or more newlines in ``content`` to two.

    The effect is that consecutive blank lines shrink to a single blank line.
    """
    collapsed = re.sub(_CONSECUTIVE_BLANK_LINES, "\n\n", content)
    return collapsed
def get_heading_level(line: str) -> int:
    """Return the Markdown heading level of ``line`` (1-6), or 0 if none.

    Leading whitespace is ignored. A heading is one to six ``#`` characters
    that are either the whole remainder of the line or are immediately
    followed by a space.
    """
    text = line.lstrip()

    # Count the leading run of '#' characters.
    hashes = 0
    for char in text:
        if char != "#":
            break
        hashes += 1

    if hashes < 1 or hashes > 6:
        return 0

    remainder = text[hashes:]
    # A bare "##" counts as a heading; otherwise a space must follow the run.
    if remainder == "" or remainder[0] == " ":
        return hashes
    return 0
def extract_titles(markdown_text: str) -> List[str]:
    """Return every heading line (levels 1-6) found in ``markdown_text``.

    Lines are returned in document order with leading whitespace removed.
    """
    return [
        line.lstrip()
        for line in markdown_text.split("\n")
        if get_heading_level(line) > 0
    ]
def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
    """Extract every section titled ``title_name`` together with its ancestors.

    A heading matches when its text, after the ``#`` run and with
    surrounding whitespace stripped, equals ``title_name`` exactly. For each
    match the output contains, in order: the chain of enclosing headings
    (outermost first), the matched heading itself, and all following lines up
    to (not including) the next heading of the same or a shallower level.
    Multiple matches are separated by a ``---`` rule.

    Args:
        markdown_text: The Markdown document to search.
        title_name: Exact heading text to look for.

    Returns:
        The extracted sections joined with newlines, or ``None`` when no
        heading matches.
    """
    lines = markdown_text.split("\n")
    levels = [get_heading_level(text) for text in lines]

    hits = [
        pos
        for pos, (text, depth) in enumerate(zip(lines, levels))
        if depth > 0 and text.lstrip()[depth:].strip() == title_name
    ]
    if not hits:
        return None

    output = []
    for ordinal, hit in enumerate(hits):
        if ordinal:
            output.append("\n---\n")

        wanted = levels[hit]

        # Walk backwards collecting the nearest heading of each strictly
        # shallower level; nothing can sit above a level-1 heading.
        ancestors = []
        ceiling = wanted
        for back in range(hit - 1, -1, -1):
            if ceiling == 1:
                break
            depth = levels[back]
            if 0 < depth < ceiling:
                ancestors.append(lines[back])
                ceiling = depth
        output.extend(reversed(ancestors))

        output.append(lines[hit])

        # Body: plain lines and deeper headings, until a sibling or a
        # shallower heading closes the section.
        forward = hit + 1
        while forward < len(lines) and (
            levels[forward] == 0 or levels[forward] > wanted
        ):
            output.append(lines[forward])
            forward += 1

    return "\n".join(output)
def search_markdown(
    content: str, pattern: str, context_lines: int = 0
) -> Optional[str]:
    """Search a Markdown document line by line with a regular expression.

    Only non-blank lines are tested against ``pattern``. Context is counted
    in non-blank lines as well: each hit is widened by ``context_lines``
    non-blank lines on both sides, and any blank lines that fall inside the
    resulting window are kept in the output. Hits whose positions (in the
    non-blank sequence) are at most ``2 * context_lines`` apart are merged
    into one window.

    Args:
        content: The Markdown text to search.
        pattern: Regular expression source, compiled with ``re.compile``.
        context_lines: Number of non-blank lines of context on each side of
            a hit. Negative values are treated as 0.

    Returns:
        The matching windows joined by a ``---`` separator line, or ``None``
        when ``pattern`` is not a valid regular expression or nothing matches.
    """
    try:
        regex = re.compile(pattern)
    except re.error:
        return None

    # A negative context would invert the window bounds and can index past
    # the end of the non-blank table below, so clamp it.
    context_lines = max(0, context_lines)

    lines = content.split("\n")

    # Original line index of every non-blank line, in document order.
    non_empty_indices = [i for i, line in enumerate(lines) if line.strip()]

    # Positions within ``non_empty_indices`` whose line matches the pattern.
    matched_positions = [
        pos
        for pos, line_idx in enumerate(non_empty_indices)
        if regex.search(lines[line_idx])
    ]
    if not matched_positions:
        return None

    # Merge hits that are close enough for their context windows to overlap.
    merged_ranges = []
    current_start = current_end = matched_positions[0]
    for pos in matched_positions[1:]:
        if pos - current_end <= context_lines * 2:
            current_end = pos
        else:
            merged_ranges.append((current_start, current_end))
            current_start = current_end = pos
    merged_ranges.append((current_start, current_end))

    last_position = len(non_empty_indices) - 1
    results = []
    for start, end in merged_ranges:
        first_line = non_empty_indices[max(0, start - context_lines)]
        last_line = non_empty_indices[min(last_position, end + context_lines)]
        # Slice the original lines so blank lines inside the window survive.
        results.append("\n".join(lines[first_line:last_line + 1]))

    return "\n---\n".join(results)
def is_url(input_str: str) -> bool:
    """Return True when ``input_str`` begins with an HTTP or HTTPS scheme.

    The scheme is compared case-insensitively, so ``HTTP://example.com`` is
    accepted as well as ``http://example.com``. Only the prefix is inspected;
    the rest of the string is not validated.
    """
    # "https://" is 8 characters, so lowering the first 8 is enough and
    # avoids copying a potentially long string.
    return input_str[:8].lower().startswith(("http://", "https://"))
def is_html_file(file_path: str) -> bool:
    """Return True when ``file_path`` ends in ``.html`` or ``.htm``.

    The comparison ignores case and looks only at the path's suffix; the
    file itself is never opened.
    """
    return file_path.lower().endswith((".html", ".htm"))