# lyxy-document/scripts/readers/html/cleaner.py

"""HTML 清理模块，用于清理 HTML 内容中的敏感信息。"""

import re
from typing import Optional, Tuple

from bs4 import BeautifulSoup


# Elements removed wholesale, together with everything nested inside them.
# The order matches the sequence in which they are stripped from the tree.
_STRIPPED_TAGS = ("script", "style", "svg", "link")

# Attributes deleted outright from every remaining element.
_STRIPPED_ATTRS = frozenset(
    {"href", "src", "srcset", "action", "style", "data-href"}
)

# http(s) URLs embedded in free text (used on ``title`` attribute values).
_URL_IN_TEXT = re.compile(r"https?://\S+", re.IGNORECASE)


def _should_strip_attr(name: str) -> bool:
    """Return True when the attribute called *name* must be deleted."""
    if name in _STRIPPED_ATTRS:
        return True
    # Lazy-loading style attributes such as ``data-src`` / ``data-img-src``.
    return name.startswith("data-") and "src" in name.lower()


def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Strip scripts, styling, vector graphics and URL-bearing attributes from HTML.

    The markup is parsed with BeautifulSoup's ``html.parser`` and then:

    * every ``script``, ``style``, ``svg`` and ``link`` element is removed;
    * the ``href``, ``src``, ``srcset``, ``action``, ``style`` and
      ``data-href`` attributes are deleted, as is any ``data-*`` attribute
      whose name contains ``src``;
    * ``http://`` / ``https://`` URLs are cut out of ``title`` values;
    * class names containing ``hyperlink-href:`` are dropped.

    Args:
        html_content: The HTML markup to clean.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the cleaned
        HTML and ``error`` is ``None``. On failure ``content`` is ``None``
        and ``error`` is a message: BeautifulSoup is not installed, the
        cleaned document is empty/whitespace-only, or processing raised.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None, "beautifulsoup4 库未安装"

    try:
        soup = BeautifulSoup(html_content, "html.parser")

        # Drop unwanted elements, one tag name at a time.
        for tag_name in _STRIPPED_TAGS:
            for element in soup.find_all(tag_name):
                element.decompose()

        # All attribute edits are independent per element, so a single walk
        # over the remaining tags is enough.
        for tag in soup.find_all(True):
            # Snapshot the names first: deleting while iterating over
            # ``tag.attrs`` would mutate the mapping being traversed.
            doomed = [name for name in tag.attrs if _should_strip_attr(name)]
            for name in doomed:
                del tag[name]

            if "title" in tag.attrs:
                tag["title"] = _URL_IN_TEXT.sub("", tag["title"])

            if "class" in tag.attrs:
                # NOTE(review): bs4 normally splits ``class`` on whitespace,
                # so a token presumably never starts with "url " (with the
                # trailing space); the check is kept to preserve behaviour -
                # confirm the intended pattern.
                tag["class"] = [
                    cls
                    for cls in tag["class"]
                    if not cls.startswith("url ") and "hyperlink-href:" not in cls
                ]

        content = str(soup)
        if not content.strip():
            return None, "清理后内容为空"
        return content, None
    except Exception as e:
        # Boundary handler: callers expect an error tuple, never an exception.
        return None, f"BeautifulSoup 解析失败: {e}"