refactor: 将 HTML 下载器拆分为子包结构

将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包，各下载器独立维护：

- 创建 downloader/ 子包，包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置（USER_AGENT、CHROME_ARGS 等）
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册，参考 parser 模式
- 更新 html/__init__.py 导入语句，从 .downloader import download_html
- 添加完整的类型注解，提升代码可维护性
2026-03-09 01:13:42 +08:00
parent 1aea561277
commit 47038475d4
8 changed files with 336 additions and 264 deletions
--- a/scripts/readers/html/__init__.py
+++ b/scripts/readers/html/__init__.py
@@ -9,7 +9,7 @@ from scripts.utils import is_url
 from scripts.utils import encoding_detection

 from . import cleaner
-from . import downloader
+from .downloader import download_html
 from . import trafilatura
 from . import domscribe
 from . import markitdown
@@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
        # 步骤 1: 获取 HTML 内容
        if is_url(file_path):
            # URL 路径: 下载 HTML
-            html_content, download_failures = downloader.download_html(file_path)
+            html_content, download_failures = download_html(file_path)
            all_failures.extend(download_failures)
            if html_content is None:
                return None, all_failures