refactor: 将 HTML 下载器拆分为子包结构

将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包，各下载器独立维护：

- 创建 downloader/ 子包，包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置（USER_AGENT、CHROME_ARGS 等）
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册，参考 parser 模式
- 更新 html/__init__.py 导入语句，从 .downloader import download_html
- 添加完整的类型注解，提升代码可维护性
2026-03-09 01:13:42 +08:00
parent 1aea561277
commit 47038475d4
8 changed files with 336 additions and 264 deletions
--- a/scripts/readers/html/downloader/selenium.py
+++ b/scripts/readers/html/downloader/selenium.py
@@ -0,0 +1,92 @@
+"""使用 selenium 下载 URL（支持 JS 渲染）"""
+
+import os
+from typing import Optional, Tuple
+
+from .common import (
+    USER_AGENT,
+    CHROME_ARGS,
+    HIDE_AUTOMATION_SCRIPT
+)
+
+
+def download(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Download a URL with selenium driving headless Chrome/Chromium (JS rendering supported).
+
+    Args:
+        url: Target URL.
+
+    Returns:
+        (content, error): content is the HTML on success and None on failure;
+                         error is None on success and an error message on failure.
+    """
+    try:  # imported lazily so a missing selenium install becomes an error tuple
+        from selenium import webdriver
+        from selenium.webdriver.chrome.service import Service
+        from selenium.webdriver.chrome.options import Options
+        from selenium.webdriver.support.ui import WebDriverWait
+    except ImportError:
+        return None, "selenium 库未安装"
+
+    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")  # path handed to Service() below
+    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")  # used as the browser binary_location
+
+    if not driver_path or not os.path.exists(driver_path):  # each path must be set and exist
+        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
+    if not binary_path or not os.path.exists(binary_path):
+        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"
+
+    chrome_options = Options()
+    chrome_options.binary_location = binary_path
+    chrome_options.add_argument("--headless=new")
+    for arg in CHROME_ARGS:  # shared launch flags defined in .common
+        chrome_options.add_argument(arg)
+
+    # Hide automation fingerprints
+    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    chrome_options.add_experimental_option("useAutomationExtension", False)
+
+    driver = None  # stays None if the browser fails to start, so finally skips quit()
+    try:
+        import time
+        service = Service(driver_path)
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+
+        # Hide the webdriver property: register HIDE_AUTOMATION_SCRIPT to run on every new document
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+            "source": HIDE_AUTOMATION_SCRIPT
+        })
+
+        driver.get(url)
+
+        # Wait for the page content to settle: first for readyState "complete" (30 s limit)
+        WebDriverWait(driver, 30).until(
+            lambda d: d.execute_script("return document.readyState") == "complete"
+        )
+
+        last_len = 0  # page_source length recorded the last time it changed
+        stable_count = 0  # consecutive polls whose length matched last_len
+        for _ in range(30):  # then poll at most 30 times, 0.5 s apart
+            current_len = len(driver.page_source)
+            if current_len == last_len:
+                stable_count += 1
+                if stable_count >= 2:  # unchanged on two polls in a row: treat as settled
+                    break
+            else:
+                stable_count = 0
+                last_len = current_len
+            time.sleep(0.5)
+
+        content = driver.page_source
+        if not content or not content.strip():  # empty or whitespace-only counts as failure
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:  # any failure above is reported through the error slot
+        return None, f"selenium 下载失败: {str(e)}"
+    finally:
+        if driver is not None:
+            try:
+                driver.quit()
+            except Exception:  # best-effort cleanup; a failed quit() must not replace the result
+                pass