refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护: - 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块 - common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等) - 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]] - 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式 - 更新 html/__init__.py 导入语句,从 .downloader import download_html - 添加完整的类型注解,提升代码可维护性
This commit is contained in:
35
scripts/readers/html/downloader/urllib.py
Normal file
35
scripts/readers/html/downloader/urllib.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""使用 urllib 下载 URL(标准库,兜底方案)"""
|
||||
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from .common import USER_AGENT
|
||||
|
||||
|
||||
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with urllib (standard library, last-resort fallback).

    The response body is decoded using the charset declared in the
    response's Content-Type header. When no charset is declared, or the
    declared name is not a codec Python knows, UTF-8 is used instead.
    Undecodable bytes are replaced rather than failing the whole download,
    so pages that are not valid UTF-8 still yield usable text.

    Args:
        url: Target URL.

    Returns:
        (content, error): on success, content is the decoded HTML text and
        error is None; on failure, content is None and error is a message
        describing what went wrong. This function does not raise for
        download errors; they are reported through the error element.
    """
    headers = {"User-Agent": USER_AGENT}

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            # Only a plain 200 counts as success; anything else is reported.
            if response.status != 200:
                return None, f"HTTP {response.status}"
            charset = response.headers.get_content_charset() or "utf-8"
            raw = response.read()

        try:
            content = raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset name Python has no codec for.
            content = raw.decode("utf-8", errors="replace")

        # An empty or whitespace-only document is treated as a failure.
        if not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        # Best-effort contract: every failure becomes an error message so
        # the caller can decide what to do next.
        return None, f"urllib 下载失败: {e}"
|
||||
Reference in New Issue
Block a user