refactor: 将核心代码迁移到 scripts 目录

- 创建 scripts/ 目录作为核心代码根目录
- 移动 core/, readers/, utils/ 到 scripts/ 下
- 移动 config.py, lyxy_document_reader.py 到 scripts/
- 移动 encoding_detection.py 到 scripts/utils/
- 更新 pyproject.toml 中的入口点路径和 pytest 配置
- 更新所有内部导入语句为 scripts.* 模块
- 更新 README.md 目录结构说明
- 更新 openspec/config.yaml 添加目录结构说明
- 删除无用的 main.py

此变更使项目结构更清晰，便于区分核心代码与测试、文档等支撑文件。
2026-03-08 17:41:03 +08:00
parent 750ef50a8d
commit 15b63800a8
50 changed files with 66 additions and 60 deletions
--- a/scripts/readers/html/cleaner.py
+++ b/scripts/readers/html/cleaner.py
@@ -0,0 +1,69 @@
+"""HTML 清理模块，用于清理 HTML 内容中的敏感信息。"""
+
+import re
+from bs4 import BeautifulSoup
+
+
+def clean_html_content(html_content: str) -> str:
+    """清理 HTML 内容，移除 script/style/link/svg 标签和 URL 属性。"""
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Remove all script tags
+    for script in soup.find_all("script"):
+        script.decompose()
+
+    # Remove all style tags
+    for style in soup.find_all("style"):
+        style.decompose()
+
+    # Remove all svg tags
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    # Remove all link tags
+    for link in soup.find_all("link"):
+        link.decompose()
+
+    # Remove URLs from href and src attributes
+    for tag in soup.find_all(True):
+        if "href" in tag.attrs:
+            del tag["href"]
+        if "src" in tag.attrs:
+            del tag["src"]
+        if "srcset" in tag.attrs:
+            del tag["srcset"]
+        if "action" in tag.attrs:
+            del tag["action"]
+        data_attrs = [
+            attr
+            for attr in tag.attrs
+            if attr.startswith("data-") and "src" in attr.lower()
+        ]
+        for attr in data_attrs:
+            del tag[attr]
+
+    # Remove all style attributes from all tags
+    for tag in soup.find_all(True):
+        if "style" in tag.attrs:
+            del tag["style"]
+
+    # Remove data-href attributes
+    for tag in soup.find_all(True):
+        if "data-href" in tag.attrs:
+            del tag["data-href"]
+
+    # Remove URLs from title attributes
+    for tag in soup.find_all(True):
+        if "title" in tag.attrs:
+            title = tag["title"]
+            cleaned_title = re.sub(r"https?://\S+", "", title, flags=re.IGNORECASE)
+            tag["title"] = cleaned_title
+
+    # Remove class attributes that contain URL-like patterns
+    for tag in soup.find_all(True):
+        if "class" in tag.attrs:
+            classes = tag["class"]
+            cleaned_classes = [c for c in classes if not c.startswith("url ") and not "hyperlink-href:" in c]
+            tag["class"] = cleaned_classes
+
+    return str(soup)