refactor: 将 HTML 下载器拆分为子包结构

将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护:

- 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等)
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式
- 更新 html/__init__.py 导入语句,从 .downloader import download_html
- 添加完整的类型注解,提升代码可维护性
This commit is contained in:
2026-03-09 01:13:42 +08:00
parent 1aea561277
commit 47038475d4
8 changed files with 336 additions and 264 deletions

View File

@@ -0,0 +1,65 @@
"""Download a URL with pyppeteer (supports JavaScript rendering)."""
import os
import asyncio
import tempfile
from typing import Optional, Tuple
from .common import (
USER_AGENT,
CHROME_ARGS,
HIDE_AUTOMATION_SCRIPT_PUPPETEER
)
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with pyppeteer, rendering JavaScript in headless Chromium.

    Args:
        url: Target URL.

    Returns:
        A ``(content, error)`` tuple. On success ``content`` is the page HTML
        and ``error`` is ``None``; on failure ``content`` is ``None`` and
        ``error`` is a message describing what went wrong.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # pyppeteer reads PYPPETEER_HOME once, while the package is being
        # imported, so the variable must be set before the import below;
        # setting it afterwards is silently ignored.
        os.environ["PYPPETEER_HOME"] = os.path.join(
            tempfile.gettempdir(), "pyppeteer_home"
        )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    # Only pass an explicit binary when the configured path really exists;
    # None leaves the choice of browser executable to pyppeteer.
    executable_path = (
        chromium_path if chromium_path and os.path.exists(chromium_path) else None
    )

    async def _download() -> str:
        """Launch a browser, load ``url`` and return the rendered HTML."""
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()
            # Registered before navigation so the script is evaluated in every
            # new document (presumably masks automation markers; see common.py).
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failure while closing must not
                    # mask the download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
    except Exception as e:
        # Boundary of the downloader: every failure is reported through the
        # (content, error) tuple rather than raised to the caller.
        return None, f"pyppeteer 下载失败: {e}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None