# lyxy-document/scripts/readers/html/downloader/selenium.py

"""使用 selenium 下载 URL（支持 JS 渲染）"""

import os
from typing import Optional, Tuple

from .common import (
    USER_AGENT,
    CHROME_ARGS,
    HIDE_AUTOMATION_SCRIPT
)


def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Fetch a URL through headless Chromium driven by selenium, so that
    JavaScript-rendered content is part of the result.

    The chromedriver executable and the browser binary are located through
    the LYXY_CHROMIUM_DRIVER and LYXY_CHROMIUM_BINARY environment variables.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the page
        source and ``error`` is None; on failure ``content`` is None and
        ``error`` is a message describing what went wrong.
    """
    # selenium is imported lazily so a missing install is reported as an
    # error value instead of breaking the import of this module.
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    chromedriver = os.environ.get("LYXY_CHROMIUM_DRIVER")
    browser_bin = os.environ.get("LYXY_CHROMIUM_BINARY")

    # Both paths must be configured and present on disk before launching.
    if not (chromedriver and os.path.exists(chromedriver)):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not (browser_bin and os.path.exists(browser_bin)):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    opts = Options()
    opts.binary_location = browser_bin
    # The headless flag goes first, followed by the shared argument list.
    for flag in ("--headless=new", *CHROME_ARGS):
        opts.add_argument(flag)

    # Reduce the visible traces of browser automation.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    def _document_ready(drv):
        """Report whether the document's readyState is "complete"."""
        return drv.execute_script("return document.readyState") == "complete"

    browser = None
    try:
        import time

        browser = webdriver.Chrome(service=Service(chromedriver), options=opts)

        # Register the masking script so it is evaluated on every new
        # document before the page's own scripts.
        cdp_params = {"source": HIDE_AUTOMATION_SCRIPT}
        browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", cdp_params)

        browser.get(url)

        # Allow up to 30 seconds for the document to finish loading.
        WebDriverWait(browser, 30).until(_document_ready)

        # Poll the page source up to 30 times, 0.5 s apart, and stop early
        # once its length has stayed unchanged for two polls in a row.
        previous_size = 0
        unchanged_polls = 0
        for _poll in range(30):
            size_now = len(browser.page_source)
            if size_now != previous_size:
                previous_size = size_now
                unchanged_polls = 0
            else:
                unchanged_polls += 1
            if unchanged_polls >= 2:
                break
            time.sleep(0.5)

        html = browser.page_source
        if html and html.strip():
            return html, None
        return None, "下载内容为空"
    except Exception as exc:
        # Any driver or browser failure is reported through the error slot.
        return None, f"selenium 下载失败: {exc!s}"
    finally:
        # Shutting the browser down is best-effort; a failure here must not
        # replace the result computed above.
        if browser is not None:
            try:
                browser.quit()
            except Exception:
                pass