From 47038475d4641b3bdaabccf2609119dc5a10afb4 Mon Sep 17 00:00:00 2001
From: lanyuanxiaoyao <lanyuanxiaoyao@gmail.com>
Date: Mon, 9 Mar 2026 01:13:42 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20=E5=B0=86=20HTML=20=E4=B8=8B?=
 =?UTF-8?q?=E8=BD=BD=E5=99=A8=E6=8B=86=E5=88=86=E4=B8=BA=E5=AD=90=E5=8C=85?=
 =?UTF-8?q?=E7=BB=93=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包，各下载器独立维护：

- 创建 downloader/ 子包，包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置（USER_AGENT、CHROME_ARGS 等）
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册，参考 parser 模式
- 更新 html/__init__.py 导入语句，从 .downloader import download_html
- 添加完整的类型注解，提升代码可维护性
---
 scripts/readers/html/__init__.py             |   4 +-
 scripts/readers/html/downloader.py           | 262 -------------------
 scripts/readers/html/downloader/__init__.py  |  39 +++
 scripts/readers/html/downloader/common.py    |  65 +++++
 scripts/readers/html/downloader/httpx.py     |  38 +++
 scripts/readers/html/downloader/pyppeteer.py |  65 +++++
 scripts/readers/html/downloader/selenium.py  |  92 +++++++
 scripts/readers/html/downloader/urllib.py    |  35 +++
 8 files changed, 336 insertions(+), 264 deletions(-)
 delete mode 100644 scripts/readers/html/downloader.py
 create mode 100644 scripts/readers/html/downloader/__init__.py
 create mode 100644 scripts/readers/html/downloader/common.py
 create mode 100644 scripts/readers/html/downloader/httpx.py
 create mode 100644 scripts/readers/html/downloader/pyppeteer.py
 create mode 100644 scripts/readers/html/downloader/selenium.py
 create mode 100644 scripts/readers/html/downloader/urllib.py

diff --git a/scripts/readers/html/__init__.py b/scripts/readers/html/__init__.py
index fb43d24..0e8a27b 100644
--- a/scripts/readers/html/__init__.py
+++ b/scripts/readers/html/__init__.py
@@ -9,7 +9,7 @@ from scripts.utils import is_url
 from scripts.utils import encoding_detection
 
 from . import cleaner
-from . import downloader
+from .downloader import download_html
 from . import trafilatura
 from . import domscribe
 from . import markitdown
@@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
         # 步骤 1: 获取 HTML 内容
         if is_url(file_path):
             # URL 路径: 下载 HTML
-            html_content, download_failures = downloader.download_html(file_path)
+            html_content, download_failures = download_html(file_path)
             all_failures.extend(download_failures)
             if html_content is None:
                 return None, all_failures
diff --git a/scripts/readers/html/downloader.py b/scripts/readers/html/downloader.py
deleted file mode 100644
index f535114..0000000
--- a/scripts/readers/html/downloader.py
+++ /dev/null
@@ -1,262 +0,0 @@
-"""URL 下载模块，按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
-
-import os
-import asyncio
-import tempfile
-import urllib.request
-import urllib.error
-from typing import Optional, Tuple
-
-
-# 公共配置
-USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
-WINDOW_SIZE = "1920,1080"
-LANGUAGE_SETTING = "zh-CN,zh"
-
-# Chrome 浏览器启动参数（pyppeteer 和 selenium 共用）
-CHROME_ARGS = [
-    "--no-sandbox",
-    "--disable-dev-shm-usage",
-    "--disable-gpu",
-    "--disable-software-rasterizer",
-    "--disable-extensions",
-    "--disable-background-networking",
-    "--disable-default-apps",
-    "--disable-sync",
-    "--disable-translate",
-    "--hide-scrollbars",
-    "--metrics-recording-only",
-    "--mute-audio",
-    "--no-first-run",
-    "--safebrowsing-disable-auto-update",
-    "--blink-settings=imagesEnabled=false",
-    "--disable-plugins",
-    "--disable-ipc-flooding-protection",
-    "--disable-renderer-backgrounding",
-    "--disable-background-timer-throttling",
-    "--disable-hang-monitor",
-    "--disable-prompt-on-repost",
-    "--disable-client-side-phishing-detection",
-    "--disable-component-update",
-    "--disable-domain-reliability",
-    "--disable-features=site-per-process",
-    "--disable-features=IsolateOrigins",
-    "--disable-features=VizDisplayCompositor",
-    "--disable-features=WebRTC",
-    f"--window-size={WINDOW_SIZE}",
-    f"--lang={LANGUAGE_SETTING}",
-    f"--user-agent={USER_AGENT}",
-]
-
-# 隐藏自动化特征的脚本（pyppeteer 和 selenium 共用）
-HIDE_AUTOMATION_SCRIPT = """
-    () => {
-        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
-        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
-        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
-    }
-"""
-
-# pyppeteer 额外的隐藏自动化脚本（包含 notifications 处理）
-HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
-    () => {
-        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
-        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
-        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
-        const originalQuery = window.navigator.permissions.query;
-        window.navigator.permissions.query = (parameters) => (
-            parameters.name === 'notifications' ?
-                Promise.resolve({ state: Notification.permission }) :
-                originalQuery(parameters)
-        );
-    }
-"""
-
-
-def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
-    """使用 pyppeteer 下载 URL（支持 JS 渲染）"""
-    try:
-        from pyppeteer import launch
-    except ImportError:
-        return None, "pyppeteer 库未安装"
-
-    async def _download():
-        pyppeteer_temp_dir = os.path.join(tempfile.gettempdir(), "pyppeteer_home")
-        chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
-        if not chromium_path:
-            os.environ["PYPPETEER_HOME"] = pyppeteer_temp_dir
-        executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
-
-        browser = None
-        try:
-            browser = await launch(
-                headless=True,
-                executablePath=executable_path,
-                args=CHROME_ARGS
-            )
-            page = await browser.newPage()
-
-            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
-
-            await page.setJavaScriptEnabled(True)
-            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
-            return await page.content()
-        finally:
-            if browser is not None:
-                try:
-                    await browser.close()
-                except Exception:
-                    pass
-
-    try:
-        content = asyncio.run(_download())
-        if not content or not content.strip():
-            return None, "下载内容为空"
-        return content, None
-    except Exception as e:
-        return None, f"pyppeteer 下载失败: {str(e)}"
-
-
-def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
-    """使用 selenium 下载 URL（支持 JS 渲染）"""
-    try:
-        from selenium import webdriver
-        from selenium.webdriver.chrome.service import Service
-        from selenium.webdriver.chrome.options import Options
-        from selenium.webdriver.support.ui import WebDriverWait
-    except ImportError:
-        return None, "selenium 库未安装"
-
-    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
-    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
-
-    if not driver_path or not os.path.exists(driver_path):
-        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
-    if not binary_path or not os.path.exists(binary_path):
-        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"
-
-    chrome_options = Options()
-    chrome_options.binary_location = binary_path
-    chrome_options.add_argument("--headless=new")
-    for arg in CHROME_ARGS:
-        chrome_options.add_argument(arg)
-
-    # 隐藏自动化特征
-    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-    chrome_options.add_experimental_option("useAutomationExtension", False)
-
-    driver = None
-    try:
-        import time
-        service = Service(driver_path)
-        driver = webdriver.Chrome(service=service, options=chrome_options)
-
-        # 隐藏 webdriver 属性
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-            "source": HIDE_AUTOMATION_SCRIPT
-        })
-
-        driver.get(url)
-
-        # 等待页面内容稳定
-        WebDriverWait(driver, 30).until(
-            lambda d: d.execute_script("return document.readyState") == "complete"
-        )
-
-        last_len = 0
-        stable_count = 0
-        for _ in range(30):
-            current_len = len(driver.page_source)
-            if current_len == last_len:
-                stable_count += 1
-                if stable_count >= 2:
-                    break
-            else:
-                stable_count = 0
-                last_len = current_len
-            time.sleep(0.5)
-
-        content = driver.page_source
-        if not content or not content.strip():
-            return None, "下载内容为空"
-        return content, None
-    except Exception as e:
-        return None, f"selenium 下载失败: {str(e)}"
-    finally:
-        if driver is not None:
-            try:
-                driver.quit()
-            except Exception:
-                pass
-
-
-def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
-    """使用 httpx 下载 URL（轻量级 HTTP 客户端）"""
-    try:
-        import httpx
-    except ImportError:
-        return None, "httpx 库未安装"
-
-    headers = {
-        "User-Agent": USER_AGENT
-    }
-
-    try:
-        with httpx.Client(timeout=30.0) as client:
-            response = client.get(url, headers=headers)
-            if response.status_code == 200:
-                content = response.text
-                if not content or not content.strip():
-                    return None, "下载内容为空"
-                return content, None
-            return None, f"HTTP {response.status_code}"
-    except Exception as e:
-        return None, f"httpx 下载失败: {str(e)}"
-
-
-def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
-    """使用 urllib 下载 URL（标准库，兜底方案）"""
-    headers = {
-        "User-Agent": USER_AGENT
-    }
-
-    try:
-        req = urllib.request.Request(url, headers=headers)
-        with urllib.request.urlopen(req, timeout=30) as response:
-            if response.status == 200:
-                content = response.read().decode("utf-8")
-                if not content or not content.strip():
-                    return None, "下载内容为空"
-                return content, None
-            return None, f"HTTP {response.status}"
-    except Exception as e:
-        return None, f"urllib 下载失败: {str(e)}"
-
-
-def download_html(url: str) -> Tuple[Optional[str], list]:
-    """
-    统一的 HTML 下载入口函数，按优先级尝试各下载器。
-
-    返回: (content, failures)
-    - content: 成功时返回 HTML 内容，失败时返回 None
-    - failures: 各下载器的失败原因列表
-    """
-    failures = []
-    content = None
-
-    # 按优先级尝试各下载器
-    downloaders = [
-        ("pyppeteer", download_with_pyppeteer),
-        ("selenium", download_with_selenium),
-        ("httpx", download_with_httpx),
-        ("urllib", download_with_urllib),
-    ]
-
-    for name, func in downloaders:
-        content, error = func(url)
-        if content is not None:
-            return content, failures
-        else:
-            failures.append(f"- {name}: {error}")
-
-    return None, failures
diff --git a/scripts/readers/html/downloader/__init__.py b/scripts/readers/html/downloader/__init__.py
new file mode 100644
index 0000000..2d4c114
--- /dev/null
+++ b/scripts/readers/html/downloader/__init__.py
@@ -0,0 +1,39 @@
+"""HTML 下载器子包，支持多种下载方式按优先级降级"""
+
+from typing import Optional, Tuple, List
+
+from . import pyppeteer
+from . import selenium
+from . import httpx
+from . import urllib
+
+
+DOWNLOADERS = [
+    ("pyppeteer", pyppeteer.download),
+    ("selenium", selenium.download),
+    ("httpx", httpx.download),
+    ("urllib", urllib.download),
+]
+
+
+def download_html(url: str) -> Tuple[Optional[str], List[str]]:
+    """
+    统一的 HTML 下载入口，按优先级尝试各下载器
+
+    Args:
+        url: 目标 URL
+
+    Returns:
+        (content, failures): content 成功时为 HTML 内容，所有失败时为 None
+                            failures 各下载器的失败原因列表
+    """
+    failures: List[str] = []
+
+    for name, func in DOWNLOADERS:
+        content, error = func(url)
+        if content is not None:
+            return content, failures
+        else:
+            failures.append(f"- {name}: {error}")
+
+    return None, failures
diff --git a/scripts/readers/html/downloader/common.py b/scripts/readers/html/downloader/common.py
new file mode 100644
index 0000000..3957411
--- /dev/null
+++ b/scripts/readers/html/downloader/common.py
@@ -0,0 +1,65 @@
+"""下载器公共配置"""
+
+# 公共配置
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+WINDOW_SIZE = "1920,1080"
+LANGUAGE_SETTING = "zh-CN,zh"
+
+# Chrome 浏览器启动参数（pyppeteer 和 selenium 共用）
+CHROME_ARGS = [
+    "--no-sandbox",
+    "--disable-dev-shm-usage",
+    "--disable-gpu",
+    "--disable-software-rasterizer",
+    "--disable-extensions",
+    "--disable-background-networking",
+    "--disable-default-apps",
+    "--disable-sync",
+    "--disable-translate",
+    "--hide-scrollbars",
+    "--metrics-recording-only",
+    "--mute-audio",
+    "--no-first-run",
+    "--safebrowsing-disable-auto-update",
+    "--blink-settings=imagesEnabled=false",
+    "--disable-plugins",
+    "--disable-ipc-flooding-protection",
+    "--disable-renderer-backgrounding",
+    "--disable-background-timer-throttling",
+    "--disable-hang-monitor",
+    "--disable-prompt-on-repost",
+    "--disable-client-side-phishing-detection",
+    "--disable-component-update",
+    "--disable-domain-reliability",
+    "--disable-features=site-per-process",
+    "--disable-features=IsolateOrigins",
+    "--disable-features=VizDisplayCompositor",
+    "--disable-features=WebRTC",
+    f"--window-size={WINDOW_SIZE}",
+    f"--lang={LANGUAGE_SETTING}",
+    f"--user-agent={USER_AGENT}",
+]
+
+# 隐藏自动化特征的脚本（pyppeteer 和 selenium 共用）
+HIDE_AUTOMATION_SCRIPT = """
+    () => {
+        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
+        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
+    }
+"""
+
+# pyppeteer 额外的隐藏自动化脚本（包含 notifications 处理）
+HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
+    () => {
+        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
+        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
+        const originalQuery = window.navigator.permissions.query;
+        window.navigator.permissions.query = (parameters) => (
+            parameters.name === 'notifications' ?
+                Promise.resolve({ state: Notification.permission }) :
+                originalQuery(parameters)
+        );
+    }
+"""
diff --git a/scripts/readers/html/downloader/httpx.py b/scripts/readers/html/downloader/httpx.py
new file mode 100644
index 0000000..a18c29d
--- /dev/null
+++ b/scripts/readers/html/downloader/httpx.py
@@ -0,0 +1,38 @@
+"""使用 httpx 下载 URL（轻量级 HTTP 客户端）"""
+
+from typing import Optional, Tuple
+
+from .common import USER_AGENT
+
+
+def download(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    使用 httpx 下载 URL（轻量级 HTTP 客户端）
+
+    Args:
+        url: 目标 URL
+
+    Returns:
+        (content, error): content 成功时为 HTML 内容，失败时为 None
+                         error 成功时为 None，失败时为错误信息
+    """
+    try:
+        import httpx
+    except ImportError:
+        return None, "httpx 库未安装"
+
+    headers = {
+        "User-Agent": USER_AGENT
+    }
+
+    try:
+        with httpx.Client(timeout=30.0) as client:
+            response = client.get(url, headers=headers)
+            if response.status_code == 200:
+                content = response.text
+                if not content or not content.strip():
+                    return None, "下载内容为空"
+                return content, None
+            return None, f"HTTP {response.status_code}"
+    except Exception as e:
+        return None, f"httpx 下载失败: {str(e)}"
diff --git a/scripts/readers/html/downloader/pyppeteer.py b/scripts/readers/html/downloader/pyppeteer.py
new file mode 100644
index 0000000..c4ae4ea
--- /dev/null
+++ b/scripts/readers/html/downloader/pyppeteer.py
@@ -0,0 +1,65 @@
+"""使用 pyppeteer 下载 URL（支持 JS 渲染）"""
+
+import os
+import asyncio
+import tempfile
+from typing import Optional, Tuple
+
+from .common import (
+    USER_AGENT,
+    CHROME_ARGS,
+    HIDE_AUTOMATION_SCRIPT_PUPPETEER
+)
+
+
+def download(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    使用 pyppeteer 下载 URL（支持 JS 渲染）
+
+    Args:
+        url: 目标 URL
+
+    Returns:
+        (content, error): content 成功时为 HTML 内容，失败时为 None
+                         error 成功时为 None，失败时为错误信息
+    """
+    try:
+        from pyppeteer import launch
+    except ImportError:
+        return None, "pyppeteer 库未安装"
+
+    async def _download():
+        pyppeteer_temp_dir = os.path.join(tempfile.gettempdir(), "pyppeteer_home")
+        chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
+        if not chromium_path:
+            os.environ["PYPPETEER_HOME"] = pyppeteer_temp_dir
+        executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
+
+        browser = None
+        try:
+            browser = await launch(
+                headless=True,
+                executablePath=executable_path,
+                args=CHROME_ARGS
+            )
+            page = await browser.newPage()
+
+            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
+
+            await page.setJavaScriptEnabled(True)
+            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
+            return await page.content()
+        finally:
+            if browser is not None:
+                try:
+                    await browser.close()
+                except Exception:
+                    pass
+
+    try:
+        content = asyncio.run(_download())
+        if not content or not content.strip():
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"pyppeteer 下载失败: {str(e)}"
diff --git a/scripts/readers/html/downloader/selenium.py b/scripts/readers/html/downloader/selenium.py
new file mode 100644
index 0000000..b61347a
--- /dev/null
+++ b/scripts/readers/html/downloader/selenium.py
@@ -0,0 +1,92 @@
+"""使用 selenium 下载 URL（支持 JS 渲染）"""
+
+import os
+from typing import Optional, Tuple
+
+from .common import (
+    USER_AGENT,
+    CHROME_ARGS,
+    HIDE_AUTOMATION_SCRIPT
+)
+
+
+def download(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    使用 selenium 下载 URL（支持 JS 渲染）
+
+    Args:
+        url: 目标 URL
+
+    Returns:
+        (content, error): content 成功时为 HTML 内容，失败时为 None
+                         error 成功时为 None，失败时为错误信息
+    """
+    try:
+        from selenium import webdriver
+        from selenium.webdriver.chrome.service import Service
+        from selenium.webdriver.chrome.options import Options
+        from selenium.webdriver.support.ui import WebDriverWait
+    except ImportError:
+        return None, "selenium 库未安装"
+
+    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
+    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
+
+    if not driver_path or not os.path.exists(driver_path):
+        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
+    if not binary_path or not os.path.exists(binary_path):
+        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"
+
+    chrome_options = Options()
+    chrome_options.binary_location = binary_path
+    chrome_options.add_argument("--headless=new")
+    for arg in CHROME_ARGS:
+        chrome_options.add_argument(arg)
+
+    # 隐藏自动化特征
+    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    chrome_options.add_experimental_option("useAutomationExtension", False)
+
+    driver = None
+    try:
+        import time
+        service = Service(driver_path)
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+
+        # 隐藏 webdriver 属性
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+            "source": HIDE_AUTOMATION_SCRIPT
+        })
+
+        driver.get(url)
+
+        # 等待页面内容稳定
+        WebDriverWait(driver, 30).until(
+            lambda d: d.execute_script("return document.readyState") == "complete"
+        )
+
+        last_len = 0
+        stable_count = 0
+        for _ in range(30):
+            current_len = len(driver.page_source)
+            if current_len == last_len:
+                stable_count += 1
+                if stable_count >= 2:
+                    break
+            else:
+                stable_count = 0
+                last_len = current_len
+            time.sleep(0.5)
+
+        content = driver.page_source
+        if not content or not content.strip():
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"selenium 下载失败: {str(e)}"
+    finally:
+        if driver is not None:
+            try:
+                driver.quit()
+            except Exception:
+                pass
diff --git a/scripts/readers/html/downloader/urllib.py b/scripts/readers/html/downloader/urllib.py
new file mode 100644
index 0000000..25bc18e
--- /dev/null
+++ b/scripts/readers/html/downloader/urllib.py
@@ -0,0 +1,35 @@
+"""使用 urllib 下载 URL（标准库，兜底方案）"""
+
+import urllib.request
+import urllib.error
+from typing import Optional, Tuple
+
+from .common import USER_AGENT
+
+
+def download(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    使用 urllib 下载 URL（标准库，兜底方案）
+
+    Args:
+        url: 目标 URL
+
+    Returns:
+        (content, error): content 成功时为 HTML 内容，失败时为 None
+                         error 成功时为 None，失败时为错误信息
+    """
+    headers = {
+        "User-Agent": USER_AGENT
+    }
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as response:
+            if response.status == 200:
+                content = response.read().decode("utf-8")
+                if not content or not content.strip():
+                    return None, "下载内容为空"
+                return content, None
+            return None, f"HTTP {response.status}"
+    except Exception as e:
+        return None, f"urllib 下载失败: {str(e)}"