将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护: - 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块 - common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等) - 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]] - 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式 - 更新 html/__init__.py 导入语句,从 .downloader import download_html - 添加完整的类型注解,提升代码可维护性
66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
"""使用 pyppeteer 下载 URL(支持 JS 渲染)"""
|
||
|
||
import os
|
||
import asyncio
|
||
import tempfile
|
||
from typing import Optional, Tuple
|
||
|
||
from .common import (
|
||
USER_AGENT,
|
||
CHROME_ARGS,
|
||
HIDE_AUTOMATION_SCRIPT_PUPPETEER
|
||
)
|
||
|
||
|
||
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with pyppeteer (headless Chromium, JavaScript enabled).

    The browser binary is taken from the ``LYXY_CHROMIUM_BINARY`` environment
    variable when it points at an existing file; otherwise ``executablePath``
    is passed as ``None`` and pyppeteer picks its own browser.

    Args:
        url: Target URL.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the page HTML
        and ``error`` is ``None``; on failure ``content`` is ``None`` and
        ``error`` is a human-readable error message. Failures are reported
        through the return value rather than raised.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # No custom browser configured: point pyppeteer's home directory at
        # the system temp dir. This has to happen BEFORE pyppeteer is
        # imported, because pyppeteer reads PYPPETEER_HOME once, at package
        # import time; setting it afterwards has no effect in this process.
        try:
            os.environ["PYPPETEER_HOME"] = os.path.join(
                tempfile.gettempdir(), "pyppeteer_home"
            )
        except OSError as e:
            # gettempdir() raises when no usable temp directory exists; keep
            # reporting that through the (None, error) contract.
            return None, f"pyppeteer 下载失败: {e}"

    # Only hand an explicit binary to pyppeteer when the configured path
    # actually exists; None makes pyppeteer fall back to its own browser.
    if chromium_path and os.path.exists(chromium_path):
        executable_path = chromium_path
    else:
        executable_path = None

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    async def _download() -> str:
        """Launch the browser, load ``url`` and return the rendered HTML."""
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()

            # Register the script from .common so it is evaluated in every
            # new document (judging by its name, it hides automation
            # fingerprints -- its content is defined outside this function).
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)

            # NOTE(review): this module imports USER_AGENT but it is never
            # applied to the page -- confirm whether
            # page.setUserAgent(USER_AGENT) was intended here.
            await page.setJavaScriptEnabled(True)
            # Wait until the network is (almost) idle, at most 30000 ms.
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failure while closing must not
                    # mask the download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
    except Exception as e:
        return None, f"pyppeteer 下载失败: {e}"

    # Whitespace-only pages are treated the same as an empty response.
    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|