Files
lyxy-document/scripts/readers/html/downloader/selenium.py
lanyuanxiaoyao 47038475d4 refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护:

- 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等)
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式
- 更新 html/__init__.py 导入语句,从 .downloader import download_html
- 添加完整的类型注解,提升代码可维护性
2026-03-09 01:13:42 +08:00

93 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""使用 selenium 下载 URL支持 JS 渲染)"""
import os
from typing import Optional, Tuple
from .common import (
USER_AGENT,
CHROME_ARGS,
HIDE_AUTOMATION_SCRIPT
)
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with selenium-driven headless Chromium (supports JS rendering).

    The chromedriver and browser executables are located through the
    ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` environment
    variables. selenium is imported lazily inside the function, so a missing
    installation is reported through the error value rather than failing
    when this module is imported.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``. On success ``content`` is the page HTML and
        ``error`` is ``None``. On failure ``content`` is ``None`` and
        ``error`` is a human-readable message. Any exception raised while
        launching or driving the browser is caught and reported this way.
    """
    import time

    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    # isfile (not exists): a directory is not a usable executable, and the
    # error text explicitly talks about a missing *file*.
    if not driver_path or not os.path.isfile(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.isfile(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    # NOTE(review): the file imports USER_AGENT from .common but this function
    # never applies it; presumably CHROME_ARGS already carries the
    # user-agent flag -- confirm against common.py.
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)
    # Hide the usual automation fingerprints.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        # Inject the script before any page script runs so the webdriver
        # property is already hidden when the site inspects it.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": HIDE_AUTOMATION_SCRIPT},
        )
        driver.get(url)

        # Wait (up to 30 s) for the document itself to finish loading. The
        # explicit message keeps a timeout from surfacing as an empty error.
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete",
            message="等待页面 readyState 变为 complete 超时",
        )

        # Then wait for JS-rendered content to settle: poll the page source
        # length up to 30 times, 0.5 s apart, and stop once two consecutive
        # polls match the previously recorded length.
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
                last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        # Boundary handler: callers expect an error value, not an exception.
        return None, f"selenium 下载失败: {e}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup; a failure to quit must not mask the
                # result that is already being returned.
                pass