refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护:
- 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等)
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式
- 更新 html/__init__.py 导入语句,从 .downloader import download_html
- 添加完整的类型注解,提升代码可维护性
This commit is contained in:
92
scripts/readers/html/downloader/selenium.py
Normal file
92
scripts/readers/html/downloader/selenium.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""使用 selenium 下载 URL(支持 JS 渲染)"""
|
||||
|
||||
import os
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from .common import (
|
||||
USER_AGENT,
|
||||
CHROME_ARGS,
|
||||
HIDE_AUTOMATION_SCRIPT
|
||||
)
|
||||
|
||||
|
||||
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a URL with Selenium-driven headless Chrome/Chromium.

    The page is loaded in a real browser, so content produced by JavaScript
    is included in the returned HTML.

    The chromedriver executable and the browser binary are located through
    the ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` environment
    variables; both must point at existing regular files.

    Args:
        url: Target URL.

    Returns:
        A ``(content, error)`` tuple. On success ``content`` is the page
        HTML and ``error`` is ``None``; on failure ``content`` is ``None``
        and ``error`` is a human-readable message. Failures while driving
        the browser are reported through the tuple rather than raised.
    """
    # Selenium is imported lazily so that a missing package is reported
    # through the normal (content, error) contract instead of ImportError.
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    # isfile() rather than exists(): a directory is not a usable executable,
    # and the error messages below promise that a *file* is missing.
    if not driver_path or not os.path.isfile(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.isfile(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = _build_chrome_options(Options, binary_path)

    driver = None
    try:
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Register the anti-detection script via CDP before navigating
        # (intended to hide the ``webdriver`` property from the page).
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": HIDE_AUTOMATION_SCRIPT},
        )

        driver.get(url)

        # First wait (up to 30 s) for the document itself to finish loading.
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        # Then give late JS rendering a chance to settle.
        _wait_for_stable_source(driver)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        # Boundary of the downloader: every browser/driver failure is
        # converted into an error message for the caller.
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup; a failing quit() must not mask the
                # result that is already being returned.
                pass


def _build_chrome_options(options_cls, binary_path: str):
    """Build the Chrome options object used for a headless download."""
    chrome_options = options_cls()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)

    # Hide automation fingerprints.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    return chrome_options


def _wait_for_stable_source(
    driver,
    max_polls: int = 30,
    interval: float = 0.5,
    required_stable: int = 2,
) -> None:
    """Poll ``driver.page_source`` until its length stops changing.

    Returns as soon as the length has matched the previously seen length
    ``required_stable`` times in a row, or after ``max_polls`` polls spaced
    ``interval`` seconds apart, whichever comes first.
    """
    import time

    last_len = 0
    stable_count = 0
    for _ in range(max_polls):
        current_len = len(driver.page_source)
        if current_len == last_len:
            stable_count += 1
            if stable_count >= required_stable:
                break
        else:
            # Content changed: restart the stability countdown.
            stable_count = 0
            last_len = current_len
        time.sleep(interval)
|
||||
Reference in New Issue
Block a user