1
0
Files
Skill/skills/lyxy-reader-html/scripts/common.py
lanyuanxiaoyao 6b4fcf2647 创建 lyxy-reader-html skill
- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容
- 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退)
- 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退)
- 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索
- 新增 spec: html-document-parsing
- 归档 change: create-lyxy-reader-html-skill
2026-03-08 02:02:03 +08:00

226 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""HTML 解析器的公共模块,包含 HTML 清理、Markdown 处理等工具函数。"""
import re
from typing import List, Optional
from bs4 import BeautifulSoup
# Matches an inline Markdown image marker of the form ![alt](target):
# "![", any run of non-"]" characters, "](", one or more non-")" characters, ")".
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# Matches a run of three or more consecutive newline characters.
_CONSECUTIVE_BLANK_LINES = re.compile(r"\n{3,}")
def clean_html_content(html_content: str) -> str:
    """Strip non-content elements and URL-bearing attributes from HTML.

    Every script/style/svg/link element is removed. For each remaining tag
    the href, src, srcset, action, style and data-href attributes are
    dropped, as is any ``data-*`` attribute whose name contains "src".
    http(s) URLs are erased from ``title`` values and URL-like entries are
    filtered out of ``class``. The cleaned document is returned as a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Elements that are discarded wholesale, in this order.
    for tag_name in ("script", "style", "svg", "link"):
        for node in soup.find_all(tag_name):
            node.decompose()

    # Single pass over the surviving tags; each step touches a different
    # attribute, so the steps are independent of one another.
    for node in soup.find_all(True):
        for attr_name in ("href", "src", "srcset", "action"):
            if attr_name in node.attrs:
                del node[attr_name]

        # Snapshot the names first: the attribute dict is mutated below.
        src_like = [
            attr_name
            for attr_name in node.attrs
            if attr_name.startswith("data-") and "src" in attr_name.lower()
        ]
        for attr_name in src_like:
            del node[attr_name]

        for attr_name in ("style", "data-href"):
            if attr_name in node.attrs:
                del node[attr_name]

        if "title" in node.attrs:
            node["title"] = re.sub(
                r"https?://\S+", "", node["title"], flags=re.IGNORECASE
            )

        if "class" in node.attrs:
            node["class"] = [
                entry
                for entry in node["class"]
                if not (entry.startswith("url ") or "hyperlink-href:" in entry)
            ]

    return str(soup)
def remove_markdown_images(markdown_text: str) -> str:
    """Return the Markdown text with every inline image marker deleted."""
    return re.sub(IMAGE_PATTERN, "", markdown_text)
def normalize_markdown_whitespace(content: str) -> str:
    """Collapse each run of three or more newlines down to exactly two."""
    return re.sub(_CONSECUTIVE_BLANK_LINES, "\n\n", content)
def get_heading_level(line: str) -> int:
    """Return the heading level (1-6) of a Markdown line, or 0 if none.

    Leading whitespace is ignored. A heading is one to six ``#`` characters
    that are either the whole remaining line or are followed by a space.
    """
    text = line.lstrip()

    # Count the leading run of '#' characters.
    hashes = 0
    for ch in text:
        if ch != "#":
            break
        hashes += 1

    if hashes < 1 or hashes > 6:
        return 0

    # Either nothing follows the hashes, or a space must separate the title.
    remainder = text[hashes:]
    if remainder and remainder[0] != " ":
        return 0
    return hashes
def extract_titles(markdown_text: str) -> List[str]:
    """Collect every heading line (levels 1-6), left-stripped, in order."""
    return [
        row.lstrip()
        for row in markdown_text.split("\n")
        if get_heading_level(row) > 0
    ]
def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
    """Extract every section whose heading text equals *title_name*.

    Each extracted section consists of the chain of ancestor headings
    (outermost first), the matching heading itself, and all following lines
    up to the next heading of the same or a shallower level. Multiple
    matches are separated by a ``---`` divider entry. Returns None when no
    heading matches.
    """
    lines = markdown_text.split("\n")
    # Heading level of every line, computed once and reused below.
    levels = [get_heading_level(text) for text in lines]

    hits = [
        pos
        for pos, (text, depth) in enumerate(zip(lines, levels))
        if depth > 0 and text.lstrip()[depth:].strip() == title_name
    ]
    if not hits:
        return None

    collected = []
    for ordinal, pos in enumerate(hits):
        if ordinal:
            collected.append("\n---\n")
        wanted = levels[pos]

        # Walk backwards, keeping each heading that is strictly shallower
        # than the shallowest one seen so far.
        ancestors = []
        floor = wanted
        for back in range(pos - 1, -1, -1):
            depth = levels[back]
            if 0 < depth < floor:
                ancestors.append(lines[back])
                floor = depth
                if floor == 1:
                    break
        collected.extend(reversed(ancestors))
        collected.append(lines[pos])

        # The section body ends at the next heading that is not deeper
        # than the matched one.
        for ahead in range(pos + 1, len(lines)):
            depth = levels[ahead]
            if 0 < depth <= wanted:
                break
            collected.append(lines[ahead])

    return "\n".join(collected)
def search_markdown(
    content: str, pattern: str, context_lines: int = 0
) -> Optional[str]:
    """Search a Markdown document line by line with a regular expression.

    Only non-blank lines are tested against the pattern, and context is
    counted in non-blank lines as well. Blank lines that fall inside a
    reported span are kept in the output. Matches whose distance (in
    non-blank lines) is at most twice the context size are merged into one
    span.

    Args:
        content: Markdown text to search.
        pattern: Regular expression applied to each non-blank line via
            ``re.search``.
        context_lines: Number of non-blank lines to include before and
            after each match. Negative values are treated as 0.

    Returns:
        The matching spans, separated from each other by a ``---`` line, or
        None when the pattern is invalid or nothing matches.
    """
    try:
        regex = re.compile(pattern)
    except re.error:
        return None

    # A negative context would move the window bounds past the match itself
    # (out-of-range or wrapped indices), so clamp it.
    context = max(context_lines, 0)

    lines = content.split("\n")
    # Original line indices of the non-blank lines; positions in this list
    # are the coordinate system used for matching and context arithmetic.
    non_empty_indices = [i for i, line in enumerate(lines) if line.strip()]

    matched_positions = [
        pos
        for pos, orig_idx in enumerate(non_empty_indices)
        if regex.search(lines[orig_idx])
    ]
    if not matched_positions:
        return None

    # Merge matches whose context windows would touch or overlap.
    merged_ranges = []
    current_start = current_end = matched_positions[0]
    for pos in matched_positions[1:]:
        if pos - current_end <= context * 2:
            current_end = pos
        else:
            merged_ranges.append((current_start, current_end))
            current_start = current_end = pos
    merged_ranges.append((current_start, current_end))

    last_pos = len(non_empty_indices) - 1
    results = []
    for start, end in merged_ranges:
        first_line = non_empty_indices[max(0, start - context)]
        last_line = non_empty_indices[min(last_pos, end + context)]
        results.append("\n".join(lines[first_line:last_line + 1]))

    return "\n---\n".join(results)
def is_url(input_str: str) -> bool:
    """Return True when *input_str* begins with an http:// or https:// scheme.

    The scheme is compared case-insensitively, so "HTTPS://example.com" is
    recognised as well. Only the prefix is inspected; the rest of the string
    is not validated.
    """
    # "https://" is 8 characters, so lowercasing this prefix is sufficient.
    return input_str[:8].lower().startswith(("http://", "https://"))
def is_html_file(file_path: str) -> bool:
    """Return True when the path ends in .html or .htm, ignoring case.

    Only the name is examined; the file is never opened.
    """
    return file_path.lower().endswith((".html", ".htm"))