# lyxy-document/scripts/readers/html/downloader/__init__.py

"""HTML 下载器子包，支持多种下载方式按优先级降级"""

from typing import Callable, Iterable, List, Optional, Tuple

from . import pyppeteer
from . import selenium
from . import httpx
from . import urllib


# Registry of download backends as (label, download function) pairs.
# download_html walks this list from top to bottom and returns as soon as one
# backend yields content, so the list order is the fallback priority.
# Each function is called with the URL and its result is unpacked as
# (content, error); the label is only used when reporting a failure.
DOWNLOADERS = [
    ("pyppeteer", pyppeteer.download),
    ("selenium", selenium.download),
    ("httpx", httpx.download),
    ("urllib", urllib.download),
]


def download_html(
    url: str,
    *,
    downloaders: Optional[
        Iterable[Tuple[str, Callable[[str], Tuple[Optional[str], object]]]]
    ] = None,
) -> Tuple[Optional[str], List[str]]:
    """
    Download the HTML at *url*, trying each backend in priority order.

    Backends are tried one after another; the first one that returns
    non-None content wins. A backend that reports an error, or that raises
    an exception, is recorded as a failure and the next backend is tried.

    Args:
        url: Target URL.
        downloaders: Optional override for the backend chain, as an iterable
            of ``(name, func)`` pairs where ``func(url)`` returns
            ``(content, error)``. Defaults to the module-level
            ``DOWNLOADERS`` list.

    Returns:
        ``(content, failures)``: ``content`` is the HTML text from the first
        successful backend, or ``None`` when every backend failed.
        ``failures`` holds one ``"- name: reason"`` line for each backend
        that was tried and failed before the result was produced.
    """
    if downloaders is None:
        downloaders = DOWNLOADERS

    failures: List[str] = []

    for name, func in downloaders:
        try:
            content, error = func(url)
        except Exception as exc:
            # A crashing backend (missing dependency, browser launch error,
            # malformed return value, ...) must not abort the fallback
            # chain: record it like any other failure and move on.
            failures.append(f"- {name}: {type(exc).__name__}: {exc}")
            continue

        # Only None means failure; an empty string is a valid (empty) page.
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures