"""HTML 清理模块,用于清理 HTML 内容中的敏感信息。""" import re from typing import Optional, Tuple from bs4 import BeautifulSoup def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]: """ 清理 HTML 内容,移除 script/style/link/svg 标签和 URL 属性。 Returns: (content, error): 成功时返回 (清理后的 HTML, None),失败时返回 (None, 错误信息) """ try: from bs4 import BeautifulSoup except ImportError: return None, "beautifulsoup4 库未安装" try: soup = BeautifulSoup(html_content, "html.parser") # Remove all script tags for script in soup.find_all("script"): script.decompose() # Remove all style tags for style in soup.find_all("style"): style.decompose() # Remove all svg tags for svg in soup.find_all("svg"): svg.decompose() # Remove all link tags for link in soup.find_all("link"): link.decompose() # Remove URLs from href and src attributes for tag in soup.find_all(True): if "href" in tag.attrs: del tag["href"] if "src" in tag.attrs: del tag["src"] if "srcset" in tag.attrs: del tag["srcset"] if "action" in tag.attrs: del tag["action"] data_attrs = [ attr for attr in tag.attrs if attr.startswith("data-") and "src" in attr.lower() ] for attr in data_attrs: del tag[attr] # Remove all style attributes from all tags for tag in soup.find_all(True): if "style" in tag.attrs: del tag["style"] # Remove data-href attributes for tag in soup.find_all(True): if "data-href" in tag.attrs: del tag["data-href"] # Remove URLs from title attributes for tag in soup.find_all(True): if "title" in tag.attrs: title = tag["title"] cleaned_title = re.sub(r"https?://\S+", "", title, flags=re.IGNORECASE) tag["title"] = cleaned_title # Remove class attributes that contain URL-like patterns for tag in soup.find_all(True): if "class" in tag.attrs: classes = tag["class"] cleaned_classes = [c for c in classes if not c.startswith("url ") and not "hyperlink-href:" in c] tag["class"] = cleaned_classes content = str(soup) if not content.strip(): return None, "清理后内容为空" return content, None except Exception as e: return None, f"BeautifulSoup 解析失败: {str(e)}"