# lyxy-document/scripts/readers/html/downloader/urllib.py

"""使用 urllib 下载 URL（标准库，兜底方案）"""

import urllib.request
import urllib.error
from typing import Optional, Tuple

from .common import USER_AGENT


def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Download a URL with urllib (standard library, used as the fallback).

    The response body is decoded with the charset declared in the
    Content-Type header. UTF-8 is used when the header declares no charset
    or names a codec Python does not know.

    Args:
        url: Target URL.

    Returns:
        (content, error): on success ``content`` is the decoded body and
        ``error`` is None; on failure ``content`` is None and ``error`` is
        a message describing the problem. Exceptions raised while
        requesting, reading or decoding are caught and reported through
        ``error`` instead of propagating.
    """
    headers = {
        "User-Agent": USER_AGENT
    }

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"

            raw = response.read()
            # Honor the server-declared encoding; many pages are not UTF-8.
            charset = response.headers.get_content_charset() or "utf-8"

        try:
            content = raw.decode(charset)
        except LookupError:
            # The declared charset is not a codec name Python recognizes.
            content = raw.decode("utf-8")

        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        # Boundary of the downloader: callers expect an error tuple, so
        # network, HTTP and decoding failures are all reported here.
        return None, f"urllib 下载失败: {str(e)}"