- 新增 skill: lyxy-reader-html,用于解析 HTML 文件和 URL 网页内容 - 支持 URL 下载(pyppeteer → selenium → httpx → urllib 优先级回退) - 支持 HTML 解析(trafilatura → domscribe → MarkItDown → html2text 优先级回退) - 支持查询功能:全文提取、字数统计、行数统计、标题提取、章节提取、正则搜索 - 新增 spec: html-document-parsing - 归档 change: create-lyxy-reader-html-skill
226 lines
6.9 KiB
Python
226 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
||
"""HTML 解析器的公共模块,包含 HTML 清理、Markdown 处理等工具函数。"""
|
||
|
||
import re
|
||
from typing import List, Optional
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Markdown image syntax: ![alt text](url)
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")

# Three or more consecutive newlines, i.e. two-plus blank lines in a row.
_CONSECUTIVE_BLANK_LINES = re.compile(r"\n{3,}")
def clean_html_content(html_content: str) -> str:
    """Clean HTML: remove script/style/svg/link elements and URL-bearing attributes.

    Whole ``<script>``/``<style>``/``<svg>``/``<link>`` elements are dropped;
    ``href``/``src``/``srcset``/``action``/``style``/``data-href`` attributes
    and ``data-*`` attributes carrying a source URL are deleted; bare http(s)
    URLs are scrubbed out of ``title`` attributes; CSS classes that encode
    hyperlink targets are filtered out.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The cleaned HTML serialized back to a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove entire elements that never contribute useful text content
    # (one traversal instead of four separate find_all passes).
    for element in soup.find_all(["script", "style", "svg", "link"]):
        element.decompose()

    # Compiled once; used to scrub URLs out of title attributes below.
    url_pattern = re.compile(r"https?://\S+", re.IGNORECASE)

    # Single pass over the remaining tags: strip URL-carrying attributes,
    # inline styles, and URL-like classes.
    for tag in soup.find_all(True):
        for attr in ("href", "src", "srcset", "action", "style", "data-href"):
            if attr in tag.attrs:
                del tag[attr]

        # data-* attributes that smuggle a source URL (e.g. data-original-src);
        # collect first so we don't mutate tag.attrs while iterating it.
        data_src_attrs = [
            attr
            for attr in tag.attrs
            if attr.startswith("data-") and "src" in attr.lower()
        ]
        for attr in data_src_attrs:
            del tag[attr]

        # Scrub bare URLs embedded in title text.
        if "title" in tag.attrs:
            tag["title"] = url_pattern.sub("", tag["title"])

        # Drop CSS classes that encode hyperlink targets.
        if "class" in tag.attrs:
            tag["class"] = [
                c
                for c in tag["class"]
                if not c.startswith("url ") and "hyperlink-href:" not in c
            ]

    return str(soup)
def remove_markdown_images(markdown_text: str) -> str:
    """Strip every Markdown image tag (``![alt](url)``) from the given text."""
    return re.sub(IMAGE_PATTERN, "", markdown_text)
def normalize_markdown_whitespace(content: str) -> str:
    """Collapse runs of two or more blank lines down to a single blank line."""
    return re.sub(_CONSECUTIVE_BLANK_LINES, "\n\n", content)
def get_heading_level(line: str) -> int:
    """Return the ATX heading level (1-6) of a Markdown line, or 0 otherwise.

    A heading is 1-6 leading ``#`` characters followed by end-of-line or a
    space; leading whitespace on the line is ignored.
    """
    text = line.lstrip()
    # Count the leading run of '#' characters.
    hashes = len(text) - len(text.lstrip("#"))
    if hashes < 1 or hashes > 6:
        return 0
    rest = text[hashes:]
    # Valid only when the hashes end the line or are followed by a space.
    return hashes if (not rest or rest[0] == " ") else 0
def extract_titles(markdown_text: str) -> List[str]:
    """Collect every heading line (levels 1-6), stripped of leading whitespace."""
    return [
        line.lstrip()
        for line in markdown_text.split("\n")
        if get_heading_level(line) > 0
    ]
def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
    """Extract every section whose heading text equals *title_name*.

    Each matched section includes its ancestor heading lines (for context),
    the matched heading itself, and all following lines up to the next
    heading of the same or a shallower level. Multiple matched sections are
    joined with a ``---`` separator.

    Args:
        markdown_text: Markdown document to search.
        title_name: Exact heading text to match (hashes and surrounding
            whitespace excluded).

    Returns:
        The combined sections as a string, or None when no heading matches.
    """
    lines = markdown_text.split("\n")
    match_indices = []

    # Find every heading line whose text (after the hashes) equals title_name.
    for i, line in enumerate(lines):
        level = get_heading_level(line)
        if level > 0:
            stripped = line.lstrip()
            title_text = stripped[level:].strip()
            if title_text == title_name:
                match_indices.append(i)

    if not match_indices:
        return None

    result_lines = []
    for match_num, idx in enumerate(match_indices):
        if match_num > 0:
            # Separator between multiple matched sections.
            result_lines.append("\n---\n")

        target_level = get_heading_level(lines[idx])

        # Walk backwards collecting ancestor headings: each must be strictly
        # shallower than the last one collected.
        parent_titles = []
        current_level = target_level
        for i in range(idx - 1, -1, -1):
            line_level = get_heading_level(lines[i])
            if line_level > 0 and line_level < current_level:
                parent_titles.append(lines[i])
                current_level = line_level
                if current_level == 1:
                    # Reached a top-level heading; no shallower ancestor exists.
                    break

        # Collected deepest-first; restore document order.
        parent_titles.reverse()
        result_lines.extend(parent_titles)

        # Emit the matched heading plus its body: every line until a heading
        # at the same or a shallower level ends the section.
        result_lines.append(lines[idx])
        for i in range(idx + 1, len(lines)):
            line = lines[i]
            line_level = get_heading_level(line)
            if line_level == 0 or line_level > target_level:
                result_lines.append(line)
            else:
                break

    return "\n".join(result_lines)
def search_markdown(
    content: str, pattern: str, context_lines: int = 0
) -> Optional[str]:
    """Search a markdown document line-by-line with a regular expression.

    Matching is performed on non-empty lines only. Matches whose context
    windows would touch are merged into one snippet; each snippet is then
    expanded by *context_lines* non-empty lines on each side (blank lines
    inside the expanded span are kept).

    Args:
        content: Markdown text to search.
        pattern: Regular expression applied to each non-empty line.
        context_lines: Number of non-empty context lines on each side.

    Returns:
        Matched snippets joined by a ``---`` separator, or None when the
        pattern is invalid or nothing matches.
    """
    try:
        regex = re.compile(pattern)
    except re.error:
        # An invalid pattern is reported as "no result" rather than raising.
        return None

    lines = content.split("\n")

    # Index the non-empty lines and record matches as positions in
    # "non-empty space" so context expansion skips over blank lines.
    non_empty_indices = []  # non-empty-space position -> original line index
    matched_positions = []  # match positions in non-empty space
    for i, line in enumerate(lines):
        if line.strip():
            non_empty_indices.append(i)
            if regex.search(line):
                matched_positions.append(len(non_empty_indices) - 1)

    if not matched_positions:
        return None

    # Merge matches whose context windows would touch or overlap.
    merged_ranges = []
    current_start = current_end = matched_positions[0]
    for pos in matched_positions[1:]:
        if pos - current_end <= context_lines * 2:
            current_end = pos
        else:
            merged_ranges.append((current_start, current_end))
            current_start = current_end = pos
    merged_ranges.append((current_start, current_end))

    # Expand each merged range by the requested context, map back to original
    # line numbers, and slice (keeps interior blank lines).
    results = []
    last_position = len(non_empty_indices) - 1
    for start, end in merged_ranges:
        first_line = non_empty_indices[max(0, start - context_lines)]
        last_line = non_empty_indices[min(last_position, end + context_lines)]
        results.append("\n".join(lines[first_line : last_line + 1]))

    return "\n---\n".join(results)
def is_url(input_str: str) -> bool:
    """Tell whether the input string looks like an http(s) URL."""
    return input_str.startswith(("http://", "https://"))
def is_html_file(file_path: str) -> bool:
    """Tell whether *file_path* has an HTML extension (.html/.htm), case-insensitively."""
    return file_path.lower().endswith((".html", ".htm"))