"""HTML 清理模块,用于清理 HTML 内容中的敏感信息。""" import re from bs4 import BeautifulSoup def clean_html_content(html_content: str) -> str: """清理 HTML 内容,移除 script/style/link/svg 标签和 URL 属性。""" soup = BeautifulSoup(html_content, "html.parser") # Remove all script tags for script in soup.find_all("script"): script.decompose() # Remove all style tags for style in soup.find_all("style"): style.decompose() # Remove all svg tags for svg in soup.find_all("svg"): svg.decompose() # Remove all link tags for link in soup.find_all("link"): link.decompose() # Remove URLs from href and src attributes for tag in soup.find_all(True): if "href" in tag.attrs: del tag["href"] if "src" in tag.attrs: del tag["src"] if "srcset" in tag.attrs: del tag["srcset"] if "action" in tag.attrs: del tag["action"] data_attrs = [ attr for attr in tag.attrs if attr.startswith("data-") and "src" in attr.lower() ] for attr in data_attrs: del tag[attr] # Remove all style attributes from all tags for tag in soup.find_all(True): if "style" in tag.attrs: del tag["style"] # Remove data-href attributes for tag in soup.find_all(True): if "data-href" in tag.attrs: del tag["data-href"] # Remove URLs from title attributes for tag in soup.find_all(True): if "title" in tag.attrs: title = tag["title"] cleaned_title = re.sub(r"https?://\S+", "", title, flags=re.IGNORECASE) tag["title"] = cleaned_title # Remove class attributes that contain URL-like patterns for tag in soup.find_all(True): if "class" in tag.attrs: classes = tag["class"] cleaned_classes = [c for c in classes if not c.startswith("url ") and not "hyperlink-href:" in c] tag["class"] = cleaned_classes return str(soup)