refactor: 将 HTML 下载器拆分为子包结构

将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包，各下载器独立维护：

- 创建 downloader/ 子包，包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置（USER_AGENT、CHROME_ARGS 等）
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册，参考 parser 模式
- 更新 html/__init__.py 导入语句，从 .downloader import download_html
- 添加完整的类型注解，提升代码可维护性
2026-03-09 01:13:42 +08:00
parent 1aea561277
commit 47038475d4
8 changed files with 336 additions and 264 deletions
--- a/scripts/readers/html/downloader/__init__.py
+++ b/scripts/readers/html/downloader/__init__.py
@@ -0,0 +1,39 @@
+"""HTML 下载器子包，支持多种下载方式按优先级降级"""
+
+from typing import Optional, Tuple, List
+
+from . import pyppeteer
+from . import selenium
+from . import httpx
+from . import urllib
+
+
+DOWNLOADERS = [  # (name, download function) pairs; download_html tries them in list order
+    ("pyppeteer", pyppeteer.download),
+    ("selenium", selenium.download),
+    ("httpx", httpx.download),
+    ("urllib", urllib.download),
+]
+
+
+def download_html(url: str) -> Tuple[Optional[str], List[str]]:
+    """
+    Unified HTML download entry point; tries each downloader in DOWNLOADERS order.
+
+    Args:
+        url: Target URL, passed unchanged to every downloader.
+
+    Returns:
+        (content, failures): content is the first non-None downloader result, or
+            None if all fail; failures lists "- name: error" per failed downloader.
+    """
+    failures: List[str] = []
+
+    for name, func in DOWNLOADERS:
+        content, error = func(url)  # each downloader returns a (content, error) pair
+        if content is not None:
+            return content, failures  # failures still holds errors from earlier downloaders
+        else:
+            failures.append(f"- {name}: {error}")
+
+    return None, failures