From 47038475d4641b3bdaabccf2609119dc5a10afb4 Mon Sep 17 00:00:00 2001 From: lanyuanxiaoyao Date: Mon, 9 Mar 2026 01:13:42 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E5=B0=86=20HTML=20=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E5=99=A8=E6=8B=86=E5=88=86=E4=B8=BA=E5=AD=90=E5=8C=85?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将 scripts/readers/html/downloader.py (262行) 拆分为 downloader/ 子包,各下载器独立维护: - 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块 - common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等) - 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]] - 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式 - 更新 html/__init__.py 导入语句,从 .downloader import download_html - 添加完整的类型注解,提升代码可维护性 --- scripts/readers/html/__init__.py | 4 +- scripts/readers/html/downloader.py | 262 ------------------- scripts/readers/html/downloader/__init__.py | 39 +++ scripts/readers/html/downloader/common.py | 65 +++++ scripts/readers/html/downloader/httpx.py | 38 +++ scripts/readers/html/downloader/pyppeteer.py | 65 +++++ scripts/readers/html/downloader/selenium.py | 92 +++++++ scripts/readers/html/downloader/urllib.py | 35 +++ 8 files changed, 336 insertions(+), 264 deletions(-) delete mode 100644 scripts/readers/html/downloader.py create mode 100644 scripts/readers/html/downloader/__init__.py create mode 100644 scripts/readers/html/downloader/common.py create mode 100644 scripts/readers/html/downloader/httpx.py create mode 100644 scripts/readers/html/downloader/pyppeteer.py create mode 100644 scripts/readers/html/downloader/selenium.py create mode 100644 scripts/readers/html/downloader/urllib.py diff --git a/scripts/readers/html/__init__.py b/scripts/readers/html/__init__.py index fb43d24..0e8a27b 100644 --- a/scripts/readers/html/__init__.py +++ b/scripts/readers/html/__init__.py @@ -9,7 +9,7 @@ from scripts.utils import is_url from scripts.utils import encoding_detection from . import cleaner -from . 
import downloader +from .downloader import download_html from . import trafilatura from . import domscribe from . import markitdown @@ -37,7 +37,7 @@ class HtmlReader(BaseReader): # 步骤 1: 获取 HTML 内容 if is_url(file_path): # URL 路径: 下载 HTML - html_content, download_failures = downloader.download_html(file_path) + html_content, download_failures = download_html(file_path) all_failures.extend(download_failures) if html_content is None: return None, all_failures diff --git a/scripts/readers/html/downloader.py b/scripts/readers/html/downloader.py deleted file mode 100644 index f535114..0000000 --- a/scripts/readers/html/downloader.py +++ /dev/null @@ -1,262 +0,0 @@ -"""URL 下载模块,按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。""" - -import os -import asyncio -import tempfile -import urllib.request -import urllib.error -from typing import Optional, Tuple - - -# 公共配置 -USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" -WINDOW_SIZE = "1920,1080" -LANGUAGE_SETTING = "zh-CN,zh" - -# Chrome 浏览器启动参数(pyppeteer 和 selenium 共用) -CHROME_ARGS = [ - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--disable-software-rasterizer", - "--disable-extensions", - "--disable-background-networking", - "--disable-default-apps", - "--disable-sync", - "--disable-translate", - "--hide-scrollbars", - "--metrics-recording-only", - "--mute-audio", - "--no-first-run", - "--safebrowsing-disable-auto-update", - "--blink-settings=imagesEnabled=false", - "--disable-plugins", - "--disable-ipc-flooding-protection", - "--disable-renderer-backgrounding", - "--disable-background-timer-throttling", - "--disable-hang-monitor", - "--disable-prompt-on-repost", - "--disable-client-side-phishing-detection", - "--disable-component-update", - "--disable-domain-reliability", - "--disable-features=site-per-process", - "--disable-features=IsolateOrigins", - "--disable-features=VizDisplayCompositor", - 
"--disable-features=WebRTC", - f"--window-size={WINDOW_SIZE}", - f"--lang={LANGUAGE_SETTING}", - f"--user-agent={USER_AGENT}", -] - -# 隐藏自动化特征的脚本(pyppeteer 和 selenium 共用) -HIDE_AUTOMATION_SCRIPT = """ - () => { - Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); - Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); - Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] }); - } -""" - -# pyppeteer 额外的隐藏自动化脚本(包含 notifications 处理) -HIDE_AUTOMATION_SCRIPT_PUPPETEER = """ - () => { - Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); - Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); - Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] }); - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - } -""" - - -def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]: - """使用 pyppeteer 下载 URL(支持 JS 渲染)""" - try: - from pyppeteer import launch - except ImportError: - return None, "pyppeteer 库未安装" - - async def _download(): - pyppeteer_temp_dir = os.path.join(tempfile.gettempdir(), "pyppeteer_home") - chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY") - if not chromium_path: - os.environ["PYPPETEER_HOME"] = pyppeteer_temp_dir - executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None - - browser = None - try: - browser = await launch( - headless=True, - executablePath=executable_path, - args=CHROME_ARGS - ) - page = await browser.newPage() - - await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER) - - await page.setJavaScriptEnabled(True) - await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000}) - return await page.content() - finally: - if browser is not None: 
- try: - await browser.close() - except Exception: - pass - - try: - content = asyncio.run(_download()) - if not content or not content.strip(): - return None, "下载内容为空" - return content, None - except Exception as e: - return None, f"pyppeteer 下载失败: {str(e)}" - - -def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]: - """使用 selenium 下载 URL(支持 JS 渲染)""" - try: - from selenium import webdriver - from selenium.webdriver.chrome.service import Service - from selenium.webdriver.chrome.options import Options - from selenium.webdriver.support.ui import WebDriverWait - except ImportError: - return None, "selenium 库未安装" - - driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER") - binary_path = os.environ.get("LYXY_CHROMIUM_BINARY") - - if not driver_path or not os.path.exists(driver_path): - return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在" - if not binary_path or not os.path.exists(binary_path): - return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在" - - chrome_options = Options() - chrome_options.binary_location = binary_path - chrome_options.add_argument("--headless=new") - for arg in CHROME_ARGS: - chrome_options.add_argument(arg) - - # 隐藏自动化特征 - chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) - chrome_options.add_experimental_option("useAutomationExtension", False) - - driver = None - try: - import time - service = Service(driver_path) - driver = webdriver.Chrome(service=service, options=chrome_options) - - # 隐藏 webdriver 属性 - driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { - "source": HIDE_AUTOMATION_SCRIPT - }) - - driver.get(url) - - # 等待页面内容稳定 - WebDriverWait(driver, 30).until( - lambda d: d.execute_script("return document.readyState") == "complete" - ) - - last_len = 0 - stable_count = 0 - for _ in range(30): - current_len = len(driver.page_source) - if current_len == last_len: - stable_count += 1 - if stable_count >= 2: - break - else: - stable_count = 0 - last_len = current_len - time.sleep(0.5) 
- - content = driver.page_source - if not content or not content.strip(): - return None, "下载内容为空" - return content, None - except Exception as e: - return None, f"selenium 下载失败: {str(e)}" - finally: - if driver is not None: - try: - driver.quit() - except Exception: - pass - - -def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]: - """使用 httpx 下载 URL(轻量级 HTTP 客户端)""" - try: - import httpx - except ImportError: - return None, "httpx 库未安装" - - headers = { - "User-Agent": USER_AGENT - } - - try: - with httpx.Client(timeout=30.0) as client: - response = client.get(url, headers=headers) - if response.status_code == 200: - content = response.text - if not content or not content.strip(): - return None, "下载内容为空" - return content, None - return None, f"HTTP {response.status_code}" - except Exception as e: - return None, f"httpx 下载失败: {str(e)}" - - -def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]: - """使用 urllib 下载 URL(标准库,兜底方案)""" - headers = { - "User-Agent": USER_AGENT - } - - try: - req = urllib.request.Request(url, headers=headers) - with urllib.request.urlopen(req, timeout=30) as response: - if response.status == 200: - content = response.read().decode("utf-8") - if not content or not content.strip(): - return None, "下载内容为空" - return content, None - return None, f"HTTP {response.status}" - except Exception as e: - return None, f"urllib 下载失败: {str(e)}" - - -def download_html(url: str) -> Tuple[Optional[str], list]: - """ - 统一的 HTML 下载入口函数,按优先级尝试各下载器。 - - 返回: (content, failures) - - content: 成功时返回 HTML 内容,失败时返回 None - - failures: 各下载器的失败原因列表 - """ - failures = [] - content = None - - # 按优先级尝试各下载器 - downloaders = [ - ("pyppeteer", download_with_pyppeteer), - ("selenium", download_with_selenium), - ("httpx", download_with_httpx), - ("urllib", download_with_urllib), - ] - - for name, func in downloaders: - content, error = func(url) - if content is not None: - return content, failures - else: - failures.append(f"- {name}: 
{error}") - - return None, failures diff --git a/scripts/readers/html/downloader/__init__.py b/scripts/readers/html/downloader/__init__.py new file mode 100644 index 0000000..2d4c114 --- /dev/null +++ b/scripts/readers/html/downloader/__init__.py @@ -0,0 +1,39 @@ +"""HTML 下载器子包,支持多种下载方式按优先级降级""" + +from typing import Optional, Tuple, List + +from . import pyppeteer +from . import selenium +from . import httpx +from . import urllib + + +DOWNLOADERS = [ + ("pyppeteer", pyppeteer.download), + ("selenium", selenium.download), + ("httpx", httpx.download), + ("urllib", urllib.download), +] + + +def download_html(url: str) -> Tuple[Optional[str], List[str]]: + """ + 统一的 HTML 下载入口,按优先级尝试各下载器 + + Args: + url: 目标 URL + + Returns: + (content, failures): content 成功时为 HTML 内容,所有失败时为 None + failures 各下载器的失败原因列表 + """ + failures: List[str] = [] + + for name, func in DOWNLOADERS: + content, error = func(url) + if content is not None: + return content, failures + else: + failures.append(f"- {name}: {error}") + + return None, failures diff --git a/scripts/readers/html/downloader/common.py b/scripts/readers/html/downloader/common.py new file mode 100644 index 0000000..3957411 --- /dev/null +++ b/scripts/readers/html/downloader/common.py @@ -0,0 +1,65 @@ +"""下载器公共配置""" + +# 公共配置 +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" +WINDOW_SIZE = "1920,1080" +LANGUAGE_SETTING = "zh-CN,zh" + +# Chrome 浏览器启动参数(pyppeteer 和 selenium 共用) +CHROME_ARGS = [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-gpu", + "--disable-software-rasterizer", + "--disable-extensions", + "--disable-background-networking", + "--disable-default-apps", + "--disable-sync", + "--disable-translate", + "--hide-scrollbars", + "--metrics-recording-only", + "--mute-audio", + "--no-first-run", + "--safebrowsing-disable-auto-update", + "--blink-settings=imagesEnabled=false", + "--disable-plugins", + 
"--disable-ipc-flooding-protection", + "--disable-renderer-backgrounding", + "--disable-background-timer-throttling", + "--disable-hang-monitor", + "--disable-prompt-on-repost", + "--disable-client-side-phishing-detection", + "--disable-component-update", + "--disable-domain-reliability", + "--disable-features=site-per-process", + "--disable-features=IsolateOrigins", + "--disable-features=VizDisplayCompositor", + "--disable-features=WebRTC", + f"--window-size={WINDOW_SIZE}", + f"--lang={LANGUAGE_SETTING}", + f"--user-agent={USER_AGENT}", +] + +# 隐藏自动化特征的脚本(pyppeteer 和 selenium 共用) +HIDE_AUTOMATION_SCRIPT = """ + () => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] }); + } +""" + +# pyppeteer 额外的隐藏自动化脚本(包含 notifications 处理) +HIDE_AUTOMATION_SCRIPT_PUPPETEER = """ + () => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] }); + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + } +""" diff --git a/scripts/readers/html/downloader/httpx.py b/scripts/readers/html/downloader/httpx.py new file mode 100644 index 0000000..a18c29d --- /dev/null +++ b/scripts/readers/html/downloader/httpx.py @@ -0,0 +1,38 @@ +"""使用 httpx 下载 URL(轻量级 HTTP 客户端)""" + +from typing import Optional, Tuple + +from .common import USER_AGENT + + +def download(url: str) -> Tuple[Optional[str], Optional[str]]: + """ + 使用 httpx 下载 URL(轻量级 HTTP 客户端) + + Args: + url: 目标 URL + + Returns: + (content, error): content 成功时为 HTML 内容,失败时为 None + error 成功时为 None,失败时为错误信息 + """ + try: + import httpx + except ImportError: + return None, "httpx 库未安装" + + headers = { + "User-Agent": USER_AGENT + } + + try: + with httpx.Client(timeout=30.0) as client: + response = client.get(url, headers=headers) + if response.status_code == 200: + content = response.text + if not content or not content.strip(): + return None, "下载内容为空" + return content, None + return None, f"HTTP {response.status_code}" + except Exception as e: + return None, f"httpx 下载失败: {str(e)}" diff --git a/scripts/readers/html/downloader/pyppeteer.py b/scripts/readers/html/downloader/pyppeteer.py new file mode 100644 index 0000000..c4ae4ea --- /dev/null +++ b/scripts/readers/html/downloader/pyppeteer.py @@ -0,0 +1,65 @@ +"""使用 pyppeteer 下载 URL(支持 JS 渲染)""" + +import os +import asyncio +import tempfile +from typing import Optional, Tuple + +from .common import ( + USER_AGENT, + CHROME_ARGS, + HIDE_AUTOMATION_SCRIPT_PUPPETEER +) + + +def download(url: str) -> Tuple[Optional[str], Optional[str]]: + """ + 使用 pyppeteer 下载 URL(支持 JS 渲染) + + Args: + url: 目标 URL + + Returns: + (content, error): content 成功时为 HTML 内容,失败时为 None + error 成功时为 None,失败时为错误信息 + """ + try: + from pyppeteer import launch + except ImportError: + return None, "pyppeteer 库未安装" + + async def _download(): + pyppeteer_temp_dir = os.path.join(tempfile.gettempdir(), "pyppeteer_home") + 
chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY") + if not chromium_path: + os.environ["PYPPETEER_HOME"] = pyppeteer_temp_dir + executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None + + browser = None + try: + browser = await launch( + headless=True, + executablePath=executable_path, + args=CHROME_ARGS + ) + page = await browser.newPage() + + await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER) + + await page.setJavaScriptEnabled(True) + await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000}) + return await page.content() + finally: + if browser is not None: + try: + await browser.close() + except Exception: + pass + + try: + content = asyncio.run(_download()) + if not content or not content.strip(): + return None, "下载内容为空" + return content, None + except Exception as e: + return None, f"pyppeteer 下载失败: {str(e)}" diff --git a/scripts/readers/html/downloader/selenium.py b/scripts/readers/html/downloader/selenium.py new file mode 100644 index 0000000..b61347a --- /dev/null +++ b/scripts/readers/html/downloader/selenium.py @@ -0,0 +1,92 @@ +"""使用 selenium 下载 URL(支持 JS 渲染)""" + +import os +from typing import Optional, Tuple + +from .common import ( + USER_AGENT, + CHROME_ARGS, + HIDE_AUTOMATION_SCRIPT +) + + +def download(url: str) -> Tuple[Optional[str], Optional[str]]: + """ + 使用 selenium 下载 URL(支持 JS 渲染) + + Args: + url: 目标 URL + + Returns: + (content, error): content 成功时为 HTML 内容,失败时为 None + error 成功时为 None,失败时为错误信息 + """ + try: + from selenium import webdriver + from selenium.webdriver.chrome.service import Service + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.support.ui import WebDriverWait + except ImportError: + return None, "selenium 库未安装" + + driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER") + binary_path = os.environ.get("LYXY_CHROMIUM_BINARY") + + if not driver_path or not os.path.exists(driver_path): + return None, "LYXY_CHROMIUM_DRIVER 
环境变量未设置或文件不存在" + if not binary_path or not os.path.exists(binary_path): + return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在" + + chrome_options = Options() + chrome_options.binary_location = binary_path + chrome_options.add_argument("--headless=new") + for arg in CHROME_ARGS: + chrome_options.add_argument(arg) + + # 隐藏自动化特征 + chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) + chrome_options.add_experimental_option("useAutomationExtension", False) + + driver = None + try: + import time + service = Service(driver_path) + driver = webdriver.Chrome(service=service, options=chrome_options) + + # 隐藏 webdriver 属性 + driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": HIDE_AUTOMATION_SCRIPT + }) + + driver.get(url) + + # 等待页面内容稳定 + WebDriverWait(driver, 30).until( + lambda d: d.execute_script("return document.readyState") == "complete" + ) + + last_len = 0 + stable_count = 0 + for _ in range(30): + current_len = len(driver.page_source) + if current_len == last_len: + stable_count += 1 + if stable_count >= 2: + break + else: + stable_count = 0 + last_len = current_len + time.sleep(0.5) + + content = driver.page_source + if not content or not content.strip(): + return None, "下载内容为空" + return content, None + except Exception as e: + return None, f"selenium 下载失败: {str(e)}" + finally: + if driver is not None: + try: + driver.quit() + except Exception: + pass diff --git a/scripts/readers/html/downloader/urllib.py b/scripts/readers/html/downloader/urllib.py new file mode 100644 index 0000000..25bc18e --- /dev/null +++ b/scripts/readers/html/downloader/urllib.py @@ -0,0 +1,35 @@ +"""使用 urllib 下载 URL(标准库,兜底方案)""" + +import urllib.request +import urllib.error +from typing import Optional, Tuple + +from .common import USER_AGENT + + +def download(url: str) -> Tuple[Optional[str], Optional[str]]: + """ + 使用 urllib 下载 URL(标准库,兜底方案) + + Args: + url: 目标 URL + + Returns: + (content, error): content 成功时为 HTML 内容,失败时为 None + error 成功时为 
None,失败时为错误信息 + """ + headers = { + "User-Agent": USER_AGENT + } + + try: + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, timeout=30) as response: + if response.status == 200: + content = response.read().decode("utf-8") + if not content or not content.strip(): + return None, "下载内容为空" + return content, None + return None, f"HTTP {response.status}" + except Exception as e: + return None, f"urllib 下载失败: {str(e)}"