Files
lyxy-document/scripts/readers/html/downloader/pyppeteer.py
lanyuanxiaoyao 47038475d4 refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护:

- 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等)
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式
- 更新 html/__init__.py 导入语句,从 .downloader import download_html
- 添加完整的类型注解,提升代码可维护性
2026-03-09 01:13:42 +08:00

66 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""使用 pyppeteer 下载 URL支持 JS 渲染)"""
import os
import asyncio
import tempfile
from typing import Optional, Tuple
from .common import (
USER_AGENT,
CHROME_ARGS,
HIDE_AUTOMATION_SCRIPT_PUPPETEER
)
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Download a URL with pyppeteer (headless Chromium, JavaScript enabled).

    The browser binary is taken from the ``LYXY_CHROMIUM_BINARY`` environment
    variable when it points to an existing path. When that variable is unset,
    ``PYPPETEER_HOME`` is redirected to a directory under the system temp
    directory before pyppeteer is imported.

    Args:
        url: Target URL.

    Returns:
        (content, error): on success ``content`` is the page HTML and
        ``error`` is None; on failure ``content`` is None and ``error``
        is a human-readable message.
    """
    # Configure the environment *before* importing pyppeteer: the package
    # resolves its home directory from PYPPETEER_HOME once, at import time,
    # so assigning it after the import (as was done previously) is ignored.
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        os.environ["PYPPETEER_HOME"] = os.path.join(
            tempfile.gettempdir(), "pyppeteer_home"
        )
    # Only hand an explicit binary to pyppeteer when it exists on disk;
    # None lets pyppeteer fall back to its own Chromium.
    executable_path = (
        chromium_path
        if (chromium_path and os.path.exists(chromium_path))
        else None
    )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    async def _download():
        """Launch the browser, load ``url`` and return the rendered HTML."""
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS
            )
            page = await browser.newPage()
            # Inject the script before any page script runs.
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failure while closing must not
                    # replace the download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"pyppeteer 下载失败: {e}"