refactor: 将 HTML 下载器拆分为子包结构
将 scripts/readers/html/downloader.py (263行) 拆分为 downloader/ 子包,各下载器独立维护:
- 创建 downloader/ 子包,包含 __init__.py、common.py 和 4 个下载器模块
- common.py 集中管理公共配置(USER_AGENT、CHROME_ARGS 等)
- 各下载器统一接口 download(url: str) -> Tuple[Optional[str], Optional[str]]
- 在 __init__.py 定义 DOWNLOADERS 列表显式注册,参考 parser 模式
- 更新 html/__init__.py 导入语句,从 .downloader import download_html
- 添加完整的类型注解,提升代码可维护性
This commit is contained in:
@@ -9,7 +9,7 @@ from scripts.utils import is_url
|
|||||||
from scripts.utils import encoding_detection
|
from scripts.utils import encoding_detection
|
||||||
|
|
||||||
from . import cleaner
|
from . import cleaner
|
||||||
from . import downloader
|
from .downloader import download_html
|
||||||
from . import trafilatura
|
from . import trafilatura
|
||||||
from . import domscribe
|
from . import domscribe
|
||||||
from . import markitdown
|
from . import markitdown
|
||||||
@@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
|
|||||||
# 步骤 1: 获取 HTML 内容
|
# 步骤 1: 获取 HTML 内容
|
||||||
if is_url(file_path):
|
if is_url(file_path):
|
||||||
# URL 路径: 下载 HTML
|
# URL 路径: 下载 HTML
|
||||||
html_content, download_failures = downloader.download_html(file_path)
|
html_content, download_failures = download_html(file_path)
|
||||||
all_failures.extend(download_failures)
|
all_failures.extend(download_failures)
|
||||||
if html_content is None:
|
if html_content is None:
|
||||||
return None, all_failures
|
return None, all_failures
|
||||||
|
|||||||
@@ -1,262 +0,0 @@
|
|||||||
"""URL 下载模块,按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import asyncio
|
|
||||||
import tempfile
|
|
||||||
import urllib.request
|
|
||||||
import urllib.error
|
|
||||||
from typing import Optional, Tuple
|
|
||||||
|
|
||||||
|
|
||||||
# Shared configuration used by every downloader backend.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"

# Chromium features to switch off. They are emitted below as ONE
# --disable-features switch: Chromium keeps a single value per switch name,
# so repeating the flag once per feature drops all but the last occurrence.
_DISABLED_FEATURES = (
    "site-per-process",
    "IsolateOrigins",
    "VizDisplayCompositor",
    "WebRTC",
)

# Chrome launch arguments shared by the pyppeteer and selenium backends.
CHROME_ARGS = [
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--disable-background-networking",
    "--disable-default-apps",
    "--disable-sync",
    "--disable-translate",
    "--hide-scrollbars",
    "--metrics-recording-only",
    "--mute-audio",
    "--no-first-run",
    "--safebrowsing-disable-auto-update",
    "--blink-settings=imagesEnabled=false",
    "--disable-plugins",
    "--disable-ipc-flooding-protection",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-hang-monitor",
    "--disable-prompt-on-repost",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-domain-reliability",
    "--disable-features=" + ",".join(_DISABLED_FEATURES),
    f"--window-size={WINDOW_SIZE}",
    f"--lang={LANGUAGE_SETTING}",
    f"--user-agent={USER_AGENT}",
]

# JavaScript that masks common automation fingerprints (shared by the
# pyppeteer and selenium backends). It is an arrow-function expression, not a
# self-running statement.
HIDE_AUTOMATION_SCRIPT = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
}
"""

# pyppeteer variant of the script above; additionally patches
# navigator.permissions.query for the 'notifications' permission.
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
    const originalQuery = window.navigator.permissions.query;
    window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
    );
}
"""
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with pyppeteer (headless Chromium, JavaScript rendered).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the page HTML and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # No browser binary is configured, so pyppeteer falls back to its own
        # managed Chromium; keep that under the temp directory. This has to
        # happen BEFORE pyppeteer is imported, because the package resolves
        # PYPPETEER_HOME once at import time.
        os.environ["PYPPETEER_HOME"] = os.path.join(
            tempfile.gettempdir(), "pyppeteer_home"
        )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    # A configured path that does not exist is ignored; None lets pyppeteer
    # pick the executable itself.
    executable_path = (
        chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
    )

    async def _download() -> str:
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failing close must not mask the
                    # download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
    except Exception as e:
        return None, f"pyppeteer 下载失败: {str(e)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with selenium-driven Chromium (JavaScript rendered).

    The ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` environment
    variables must both point at existing files.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the page HTML and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)

    # Hide automation markers exposed through Chrome options.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        import time

        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # CDP evaluates `source` verbatim as a script. HIDE_AUTOMATION_SCRIPT
        # is an arrow-function expression, so it must be wrapped in a call;
        # otherwise the function is created and discarded without running.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": f"({HIDE_AUTOMATION_SCRIPT})();"},
        )

        driver.get(url)

        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Poll the page source every 0.5 s (at most 30 times) and stop once its
        # length was unchanged on two consecutive checks.
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
            last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failing quit must not replace the
                # result that is already being returned.
                pass
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with httpx (lightweight HTTP client, no JS rendering).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the response text and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    try:
        # httpx does not follow 3xx responses unless asked to; without this a
        # plain http -> https redirect would be reported as a failure.
        # NOTE(review): `follow_redirects` needs httpx >= 0.20 - confirm the
        # pinned version.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers={"User-Agent": USER_AGENT})
            if response.status_code != 200:
                return None, f"HTTP {response.status_code}"
            content = response.text
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
|
||||||
|
|
||||||
|
|
||||||
def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with the standard-library urllib (last-resort fallback).

    The body is decoded with the charset declared in the ``Content-Type``
    header when there is one, then with UTF-8.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the decoded body and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"
            raw = response.read()
            declared = response.headers.get_content_charset()
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"

    # Honour the server-declared charset first (e.g. GBK pages); UTF-8 remains
    # the fallback so pages without a declaration behave as before.
    encodings = [declared, "utf-8"] if declared else ["utf-8"]
    content = None
    decode_error = None
    for encoding in encodings:
        try:
            content = raw.decode(encoding)
            break
        except (LookupError, UnicodeDecodeError) as e:
            decode_error = e
    if content is None:
        return None, f"urllib 下载失败: {str(decode_error)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
|
||||||
|
|
||||||
|
|
||||||
def download_html(url: str) -> Tuple[Optional[str], list]:
    """Download *url*, trying each backend in priority order.

    Order: pyppeteer, selenium, httpx, urllib. The first backend that returns
    content wins.

    Args:
        url: Target URL.

    Returns:
        ``(content, failures)``: ``content`` is the HTML on success or ``None``
        when every backend failed; ``failures`` lists the failure reason of
        each backend tried before the successful one (or of all of them).
    """
    failures = []

    downloaders = [
        ("pyppeteer", download_with_pyppeteer),
        ("selenium", download_with_selenium),
        ("httpx", download_with_httpx),
        ("urllib", download_with_urllib),
    ]

    for name, func in downloaders:
        try:
            content, error = func(url)
        except Exception as e:
            # A backend that raises instead of returning an error must not
            # abort the chain; record it and fall through to the next one.
            content, error = None, f"{name} 下载失败: {str(e)}"
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
|
||||||
39
scripts/readers/html/downloader/__init__.py
Normal file
39
scripts/readers/html/downloader/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""HTML 下载器子包,支持多种下载方式按优先级降级"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple, List
|
||||||
|
|
||||||
|
from . import pyppeteer
|
||||||
|
from . import selenium
|
||||||
|
from . import httpx
|
||||||
|
from . import urllib
|
||||||
|
|
||||||
|
|
||||||
|
# Registered backends as (label, download function) pairs, in priority order.
DOWNLOADERS = [
    ("pyppeteer", pyppeteer.download),
    ("selenium", selenium.download),
    ("httpx", httpx.download),
    ("urllib", urllib.download),
]


def download_html(url: str) -> Tuple[Optional[str], List[str]]:
    """Download *url*, trying each registered backend in priority order.

    The first backend in ``DOWNLOADERS`` that returns content wins.

    Args:
        url: Target URL.

    Returns:
        ``(content, failures)``: ``content`` is the HTML on success or ``None``
        when every backend failed; ``failures`` lists the failure reason of
        each backend tried before the successful one (or of all of them).
    """
    failures: List[str] = []

    for name, func in DOWNLOADERS:
        try:
            content, error = func(url)
        except Exception as e:
            # A backend that raises instead of returning an error must not
            # abort the chain; record it and fall through to the next one.
            content, error = None, f"{name} 下载失败: {str(e)}"
        if content is not None:
            return content, failures
        failures.append(f"- {name}: {error}")

    return None, failures
|
||||||
65
scripts/readers/html/downloader/common.py
Normal file
65
scripts/readers/html/downloader/common.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""下载器公共配置"""
|
||||||
|
|
||||||
|
# Shared configuration used by every downloader backend.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"

# Chromium features to switch off. They are emitted below as ONE
# --disable-features switch: Chromium keeps a single value per switch name,
# so repeating the flag once per feature drops all but the last occurrence.
_DISABLED_FEATURES = (
    "site-per-process",
    "IsolateOrigins",
    "VizDisplayCompositor",
    "WebRTC",
)

# Chrome launch arguments shared by the pyppeteer and selenium backends.
CHROME_ARGS = [
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--disable-background-networking",
    "--disable-default-apps",
    "--disable-sync",
    "--disable-translate",
    "--hide-scrollbars",
    "--metrics-recording-only",
    "--mute-audio",
    "--no-first-run",
    "--safebrowsing-disable-auto-update",
    "--blink-settings=imagesEnabled=false",
    "--disable-plugins",
    "--disable-ipc-flooding-protection",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-hang-monitor",
    "--disable-prompt-on-repost",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-domain-reliability",
    "--disable-features=" + ",".join(_DISABLED_FEATURES),
    f"--window-size={WINDOW_SIZE}",
    f"--lang={LANGUAGE_SETTING}",
    f"--user-agent={USER_AGENT}",
]

# JavaScript that masks common automation fingerprints (shared by the
# pyppeteer and selenium backends). It is an arrow-function expression, not a
# self-running statement.
HIDE_AUTOMATION_SCRIPT = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
}
"""

# pyppeteer variant of the script above; additionally patches
# navigator.permissions.query for the 'notifications' permission.
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
    const originalQuery = window.navigator.permissions.query;
    window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
    );
}
"""
|
||||||
38
scripts/readers/html/downloader/httpx.py
Normal file
38
scripts/readers/html/downloader/httpx.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
"""使用 httpx 下载 URL(轻量级 HTTP 客户端)"""
|
||||||
|
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with httpx (lightweight HTTP client, no JS rendering).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the response text and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    try:
        # httpx does not follow 3xx responses unless asked to; without this a
        # plain http -> https redirect would be reported as a failure.
        # NOTE(review): `follow_redirects` needs httpx >= 0.20 - confirm the
        # pinned version.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers={"User-Agent": USER_AGENT})
            if response.status_code != 200:
                return None, f"HTTP {response.status_code}"
            content = response.text
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
||||||
65
scripts/readers/html/downloader/pyppeteer.py
Normal file
65
scripts/readers/html/downloader/pyppeteer.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""使用 pyppeteer 下载 URL(支持 JS 渲染)"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import (
|
||||||
|
USER_AGENT,
|
||||||
|
CHROME_ARGS,
|
||||||
|
HIDE_AUTOMATION_SCRIPT_PUPPETEER
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with pyppeteer (headless Chromium, JavaScript rendered).

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the page HTML and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not chromium_path:
        # No browser binary is configured, so pyppeteer falls back to its own
        # managed Chromium; keep that under the temp directory. This has to
        # happen BEFORE pyppeteer is imported, because the package resolves
        # PYPPETEER_HOME once at import time.
        os.environ["PYPPETEER_HOME"] = os.path.join(
            tempfile.gettempdir(), "pyppeteer_home"
        )

    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    # A configured path that does not exist is ignored; None lets pyppeteer
    # pick the executable itself.
    executable_path = (
        chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
    )

    async def _download() -> str:
        browser = None
        try:
            browser = await launch(
                headless=True,
                executablePath=executable_path,
                args=CHROME_ARGS,
            )
            page = await browser.newPage()
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failing close must not mask the
                    # download result or the original error.
                    pass

    try:
        content = asyncio.run(_download())
    except Exception as e:
        return None, f"pyppeteer 下载失败: {str(e)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
||||||
92
scripts/readers/html/downloader/selenium.py
Normal file
92
scripts/readers/html/downloader/selenium.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""使用 selenium 下载 URL(支持 JS 渲染)"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import (
|
||||||
|
USER_AGENT,
|
||||||
|
CHROME_ARGS,
|
||||||
|
HIDE_AUTOMATION_SCRIPT
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with selenium-driven Chromium (JavaScript rendered).

    The ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` environment
    variables must both point at existing files.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the page HTML and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")

    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)

    # Hide automation markers exposed through Chrome options.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        import time

        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # CDP evaluates `source` verbatim as a script. HIDE_AUTOMATION_SCRIPT
        # is an arrow-function expression, so it must be wrapped in a call;
        # otherwise the function is created and discarded without running.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": f"({HIDE_AUTOMATION_SCRIPT})();"},
        )

        driver.get(url)

        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Poll the page source every 0.5 s (at most 30 times) and stop once its
        # length was unchanged on two consecutive checks.
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
            last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: a failing quit must not replace the
                # result that is already being returned.
                pass
|
||||||
35
scripts/readers/html/downloader/urllib.py
Normal file
35
scripts/readers/html/downloader/urllib.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
"""使用 urllib 下载 URL(标准库,兜底方案)"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .common import USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
|
def download(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with the standard-library urllib (last-resort fallback).

    The body is decoded with the charset declared in the ``Content-Type``
    header when there is one, then with UTF-8.

    Args:
        url: Target URL.

    Returns:
        ``(content, error)``: the decoded body and ``None`` on success, or
        ``None`` and a failure message otherwise.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"
            raw = response.read()
            declared = response.headers.get_content_charset()
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"

    # Honour the server-declared charset first (e.g. GBK pages); UTF-8 remains
    # the fallback so pages without a declaration behave as before.
    encodings = [declared, "utf-8"] if declared else ["utf-8"]
    content = None
    decode_error = None
    for encoding in encodings:
        try:
            content = raw.decode(encoding)
            break
        except (LookupError, UnicodeDecodeError) as e:
            decode_error = e
    if content is None:
        return None, f"urllib 下载失败: {str(decode_error)}"

    if not content or not content.strip():
        return None, "下载内容为空"
    return content, None
|
||||||
Reference in New Issue
Block a user