refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护: - 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块 - common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等) - 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]] - 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式 - 更新 html/__init__.py 导入语句,从 .downloader import download_html - 添加完整的类型注解,提升代码可维护性
This commit is contained in:
39
scripts/readers/html/downloader/__init__.py
Normal file
39
scripts/readers/html/downloader/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""HTML 下载器子包,支持多种下载方式按优先级降级"""
|
||||
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
from . import pyppeteer
|
||||
from . import selenium
|
||||
from . import httpx
|
||||
from . import urllib
|
||||
|
||||
|
||||
# Registry of download backends as (name, download callable) pairs.
# download_html walks this list front to back, so the order here is the
# fallback priority: earlier entries are tried first.
DOWNLOADERS = [
    (label, backend.download)
    for label, backend in (
        ("pyppeteer", pyppeteer),
        ("selenium", selenium),
        ("httpx", httpx),
        ("urllib", urllib),
    )
]
|
||||
|
||||
|
||||
def download_html(url: str) -> Tuple[Optional[str], List[str]]:
    """Download the HTML for ``url``, trying each registered downloader in order.

    The entries of ``DOWNLOADERS`` are attempted one after another; the first
    one that yields non-``None`` content wins and no further downloader is
    called.

    Args:
        url: Target URL.

    Returns:
        A ``(content, failures)`` pair. ``content`` is the HTML returned by
        the first successful downloader, or ``None`` if every downloader
        failed. ``failures`` holds one ``"- <name>: <reason>"`` entry for each
        downloader that was tried and failed before a result was obtained.
    """
    failures: List[str] = []

    for name, func in DOWNLOADERS:
        try:
            content, error = func(url)
        except Exception as exc:
            # A crashing downloader (or one returning a malformed result) must
            # not abort the fallback chain: record why it failed and move on
            # to the next, lower-priority downloader.
            failures.append(f"- {name}: {type(exc).__name__}: {exc}")
            continue

        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
||||
Reference in New Issue
Block a user