"""URL download module.

Tries the available downloaders in priority order:
pyppeteer -> selenium -> httpx -> urllib.

Every ``download_with_*`` function returns a ``(content, error)`` tuple:
``content`` is the HTML text on success (``error`` is ``None``), otherwise
``content`` is ``None`` and ``error`` is a human-readable failure reason.
"""

import asyncio
import os
import tempfile
import time
import urllib.error
import urllib.request
from typing import List, Optional, Tuple

# Shared configuration
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"

# Chrome launch arguments (shared by pyppeteer and selenium).
# NOTE(review): "--disable-features" is passed several times below; Chromium
# is understood to honor only the last occurrence of a repeated switch, so
# these may need to be merged into one comma-separated value — confirm before
# changing, since that would alter which features are actually disabled.
CHROME_ARGS = [
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--disable-background-networking",
    "--disable-default-apps",
    "--disable-sync",
    "--disable-translate",
    "--hide-scrollbars",
    "--metrics-recording-only",
    "--mute-audio",
    "--no-first-run",
    "--safebrowsing-disable-auto-update",
    "--blink-settings=imagesEnabled=false",
    "--disable-plugins",
    "--disable-ipc-flooding-protection",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-hang-monitor",
    "--disable-prompt-on-repost",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-domain-reliability",
    "--disable-features=site-per-process",
    "--disable-features=IsolateOrigins",
    "--disable-features=VizDisplayCompositor",
    "--disable-features=WebRTC",
    f"--window-size={WINDOW_SIZE}",
    f"--lang={LANGUAGE_SETTING}",
    f"--user-agent={USER_AGENT}",
]

# Script that hides automation fingerprints. It is a JavaScript *function
# expression*: whoever injects it is responsible for invoking it.
HIDE_AUTOMATION_SCRIPT = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
}
"""

# pyppeteer variant of the script above; additionally patches
# permissions.query for the "notifications" permission.
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
    const originalQuery = window.navigator.permissions.query;
    window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters)
    );
}
"""


def _decode_body(raw: bytes, charset: Optional[str]) -> str:
    """Decode an HTTP response body to text.

    Tries the charset declared by the server first, then strict UTF-8, and
    finally UTF-8 with replacement characters so a best-effort result is
    always returned.

    Args:
        raw: The undecoded response body.
        charset: Charset taken from the Content-Type header, or ``None``.

    Returns:
        The decoded text.
    """
    for encoding in (charset, "utf-8"):
        if not encoding:
            continue
        try:
            return raw.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or bytes that do not match it: try the next.
            continue
    return raw.decode("utf-8", errors="replace")


def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with pyppeteer (headless Chromium, renders JavaScript).

    Reads the ``LYXY_CHROMIUM_BINARY`` environment variable. When it names an
    existing file, that binary is launched; when it is unset, pyppeteer is
    pointed at a directory under the system temp dir via ``PYPPETEER_HOME``
    (unless the caller already set that variable).

    Args:
        url: The page to fetch.

    Returns:
        ``(content, None)`` on success, ``(None, reason)`` on failure.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # This must happen BEFORE pyppeteer is imported: pyppeteer resolves
        # PYPPETEER_HOME when the package is first imported, so setting it
        # afterwards has no effect. setdefault keeps a user-provided value.
        os.environ.setdefault(
            "PYPPETEER_HOME",
            os.path.join(tempfile.gettempdir(), "pyppeteer_home"),
        )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    # Only hand an explicit executable to pyppeteer if it really exists.
    executable_path = (
        chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
    )

    async def _download():
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup; a close failure must not mask the
                    # download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"pyppeteer 下载失败: {e}"


def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with selenium + Chrome (renders JavaScript).

    Requires the ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY``
    environment variables to point at existing files.

    Args:
        url: The page to fetch.

    Returns:
        ``(content, None)`` on success, ``(None, reason)`` on failure.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)

    # Hide automation fingerprints
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # The CDP command evaluates "source" as a plain script, so the
        # function expression has to be wrapped in an IIFE; passing the bare
        # arrow function would define it without ever running it.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": f"({HIDE_AUTOMATION_SCRIPT})();"},
        )

        driver.get(url)

        # Wait for the document to finish loading ...
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # ... then poll until the page source length is unchanged for two
        # consecutive checks (at most 30 polls, 0.5 s apart).
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
                last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"selenium 下载失败: {e}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup; ignore errors while shutting down.
                pass


def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with httpx (lightweight HTTP client, no JS rendering).

    Args:
        url: The page to fetch.

    Returns:
        ``(content, None)`` on success, ``(None, reason)`` on failure; a
        non-200 final status is reported as ``"HTTP <code>"``.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    headers = {"User-Agent": USER_AGENT}

    try:
        # httpx does not follow redirects unless asked to; without this a
        # 301/302 would be reported as a failure while urllib would succeed.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers=headers)
            if response.status_code == 200:
                content = response.text
                if not content or not content.strip():
                    return None, "下载内容为空"
                return content, None
            return None, f"HTTP {response.status_code}"
    except Exception as e:
        return None, f"httpx 下载失败: {e}"


def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download ``url`` with urllib (standard library, last-resort fallback).

    The body is decoded with the charset declared in the response's
    Content-Type header when present, falling back to UTF-8.

    Args:
        url: The page to fetch.

    Returns:
        ``(content, None)`` on success, ``(None, reason)`` on failure.
    """
    headers = {"User-Agent": USER_AGENT}

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status == 200:
                content = _decode_body(
                    response.read(),
                    response.headers.get_content_charset(),
                )
                if not content or not content.strip():
                    return None, "下载内容为空"
                return content, None
            return None, f"HTTP {response.status}"
    except Exception as e:
        return None, f"urllib 下载失败: {e}"


def download_html(url: str) -> Tuple[Optional[str], List[str]]:
    """Unified HTML download entry point; tries each downloader in order.

    Args:
        url: The page to fetch.

    Returns:
        ``(content, failures)`` where ``content`` is the HTML text from the
        first downloader that succeeds (``None`` if all fail) and
        ``failures`` lists the failure reason of every downloader tried
        before that, formatted as ``"- <name>: <reason>"``.
    """
    failures: List[str] = []

    # Downloaders in priority order
    downloaders = [
        ("pyppeteer", download_with_pyppeteer),
        ("selenium", download_with_selenium),
        ("httpx", download_with_httpx),
        ("urllib", download_with_urllib),
    ]

    for name, func in downloaders:
        content, error = func(url)
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures