# lyxy-document/readers/html/cleaner.py

"""HTML 清理模块，用于清理 HTML 内容中的敏感信息。"""

import re
from bs4 import BeautifulSoup


def clean_html_content(html_content: str) -> str:
    """Strip scripts, styling and URL-bearing attributes from HTML markup.

    The markup is parsed with BeautifulSoup's ``"html.parser"`` backend and
    then cleaned in place:

    * every ``script``, ``style``, ``svg`` and ``link`` element is
      decomposed;
    * the ``href``, ``src``, ``srcset``, ``action``, ``style`` and
      ``data-href`` attributes are deleted from all remaining tags, as is
      any ``data-*`` attribute whose name contains ``src``;
    * ``http://`` / ``https://`` URLs are erased from ``title`` values;
    * ``class`` entries that start with ``"url "`` or contain
      ``"hyperlink-href:"`` are filtered out.

    Args:
        html_content: The HTML markup to clean.

    Returns:
        The cleaned document serialized with ``str(soup)``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Drop unwanted elements one tag name at a time, keeping the original
    # removal order (script, style, svg, link).
    for tag_name in ("script", "style", "svg", "link"):
        for element in soup.find_all(tag_name):
            element.decompose()

    url_in_text = re.compile(r"https?://\S+", flags=re.IGNORECASE)
    always_dropped = ("href", "src", "srcset", "action")

    # Each step below touches a different attribute of the current tag, so
    # one traversal yields the same result as a separate pass per step.
    for node in soup.find_all(True):
        for attr_name in always_dropped:
            if attr_name in node.attrs:
                del node[attr_name]

        # Snapshot the matching names first so the attribute mapping is not
        # modified while it is being iterated.
        lazy_source_attrs = [
            attr_name
            for attr_name in node.attrs
            if attr_name.startswith("data-") and "src" in attr_name.lower()
        ]
        for attr_name in lazy_source_attrs:
            del node[attr_name]

        if "style" in node.attrs:
            del node["style"]

        if "data-href" in node.attrs:
            del node["data-href"]

        # The title attribute is kept; only URL substrings are removed.
        if "title" in node.attrs:
            node["title"] = url_in_text.sub("", node["title"])

        # The class attribute is reassigned even when every entry is
        # filtered out, so it stays present on the tag.
        if "class" in node.attrs:
            node["class"] = [
                entry
                for entry in node["class"]
                if not (entry.startswith("url ") or "hyperlink-href:" in entry)
            ]

    return str(soup)