feat: 统一文档解析器项目 - 迁移 lyxy-reader-office 和 lyxy-reader-html

## 功能特性

- 建立统一的项目结构,包含 core/、readers/、utils/、tests/ 模块
- 迁移 lyxy-reader-office 的所有解析器(docx、xlsx、pptx、pdf)
- 迁移 lyxy-reader-html 的所有解析器(html、url 下载)
- 统一 CLI 入口为 lyxy_document_reader.py
- 统一 Markdown 后处理逻辑
- 按文件类型组织 readers,每个解析器独立文件
- 依赖分组按文件类型细分(docx、xlsx、pptx、pdf、html、http)
- PDF OCR 解析器优先,无参数控制
- 使用 logging 模块替代简单 print
- 设计完整的单元测试结构
- 重写项目文档

## 新增目录/文件

- core/ - 核心模块(异常体系、Markdown 工具、解析调度器)
- readers/ - 格式阅读器(base.py + docx/xlsx/pptx/pdf/html)
- utils/ - 工具函数(文件类型检测)
- tests/ - 测试(conftest.py + test_core/ + test_readers/ + test_utils/)
- lyxy_document_reader.py - 统一 CLI 入口

## 依赖分组

- docx - DOCX 文档解析支持
- xlsx - XLSX 文档解析支持
- pptx - PPTX 文档解析支持
- pdf - PDF 文档解析支持(含 OCR)
- html - HTML/URL 解析支持
- http - HTTP/URL 下载支持
- office - Office 格式组合(docx/xlsx/pptx/pdf)
- web - Web 格式组合(html/http)
- full - 完整功能
- dev - 开发依赖
This commit is contained in:
2026-03-08 13:46:37 +08:00
parent eb8973495e
commit 833018d451
66 changed files with 4054 additions and 0 deletions

262
readers/html/downloader.py Normal file
View File

@@ -0,0 +1,262 @@
"""URL 下载模块,按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
import os
import asyncio
import tempfile
import urllib.request
import urllib.error
from typing import Optional, Tuple
# 公共配置
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"
# Chrome 浏览器启动参数pyppeteer 和 selenium 共用)
CHROME_ARGS = [
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--disable-software-rasterizer",
"--disable-extensions",
"--disable-background-networking",
"--disable-default-apps",
"--disable-sync",
"--disable-translate",
"--hide-scrollbars",
"--metrics-recording-only",
"--mute-audio",
"--no-first-run",
"--safebrowsing-disable-auto-update",
"--blink-settings=imagesEnabled=false",
"--disable-plugins",
"--disable-ipc-flooding-protection",
"--disable-renderer-backgrounding",
"--disable-background-timer-throttling",
"--disable-hang-monitor",
"--disable-prompt-on-repost",
"--disable-client-side-phishing-detection",
"--disable-component-update",
"--disable-domain-reliability",
"--disable-features=site-per-process",
"--disable-features=IsolateOrigins",
"--disable-features=VizDisplayCompositor",
"--disable-features=WebRTC",
f"--window-size={WINDOW_SIZE}",
f"--lang={LANGUAGE_SETTING}",
f"--user-agent={USER_AGENT}",
]
# 隐藏自动化特征的脚本pyppeteer 和 selenium 共用)
HIDE_AUTOMATION_SCRIPT = """
() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
}
"""
# pyppeteer 额外的隐藏自动化脚本(包含 notifications 处理)
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
}
"""
def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with pyppeteer (headless Chromium, JS rendering supported).

    Returns (html, None) on success or (None, error_message) on failure.
    """
    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    async def _fetch() -> str:
        # Honour an externally supplied Chromium binary; otherwise point
        # PYPPETEER_HOME at a temp dir so pyppeteer manages its own download.
        binary = os.environ.get("LYXY_CHROMIUM_BINARY")
        if not binary:
            os.environ["PYPPETEER_HOME"] = os.path.join(
                tempfile.gettempdir(), "pyppeteer_home"
            )
        executable = binary if (binary and os.path.exists(binary)) else None
        browser = await launch(
            headless=True,
            executablePath=executable,
            args=CHROME_ARGS,
        )
        try:
            page = await browser.newPage()
            # Inject the fingerprint-hiding script before any page JS runs.
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            try:
                await browser.close()
            except Exception:
                pass

    try:
        html = asyncio.run(_fetch())
    except Exception as e:
        return None, f"pyppeteer 下载失败: {str(e)}"
    if not html or not html.strip():
        return None, "下载内容为空"
    return html, None
def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with selenium-driven headless Chrome (JS rendering supported).

    Requires LYXY_CHROMIUM_DRIVER and LYXY_CHROMIUM_BINARY to point at an
    existing chromedriver and Chrome binary on disk.
    Returns (html, None) on success or (None, error_message) on failure.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    opts = Options()
    opts.binary_location = binary_path
    opts.add_argument("--headless=new")
    for flag in CHROME_ARGS:
        opts.add_argument(flag)
    # Strip the obvious automation markers Chrome exposes by default.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        import time
        driver = webdriver.Chrome(service=Service(driver_path), options=opts)
        # Inject the fingerprint-hiding script before any page script runs.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": HIDE_AUTOMATION_SCRIPT
        })
        driver.get(url)
        # Wait for the document itself, then poll until the DOM stops
        # changing (two identical length samples), up to ~15s.
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        seen_len = 0
        stable = 0
        for _ in range(30):
            size = len(driver.page_source)
            if size == seen_len:
                stable += 1
                if stable >= 2:
                    break
            else:
                stable = 0
                seen_len = size
            time.sleep(0.5)
        html = driver.page_source
        if not html or not html.strip():
            return None, "下载内容为空"
        return html, None
    except Exception as e:
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with httpx (plain HTTP client, no JS rendering).

    Returns (html, None) on success or (None, error_message) on failure.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"
    headers = {
        "User-Agent": USER_AGENT
    }
    try:
        with httpx.Client(timeout=30.0) as client:
            resp = client.get(url, headers=headers)
            if resp.status_code != 200:
                return None, f"HTTP {resp.status_code}"
            body = resp.text
            if not body or not body.strip():
                return None, "下载内容为空"
            return body, None
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"
def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download *url* with the standard-library urllib (last-resort fallback).

    Returns (html, None) on success or (None, error_message) on failure.
    """
    headers = {
        "User-Agent": USER_AGENT
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status == 200:
                # Honour the charset declared in the Content-Type header
                # instead of assuming UTF-8 (fixes e.g. GBK-encoded pages);
                # fall back to UTF-8 and replace undecodable bytes so a
                # single bad byte does not fail the whole download.
                charset = response.headers.get_content_charset() or "utf-8"
                content = response.read().decode(charset, errors="replace")
                if not content or not content.strip():
                    return None, "下载内容为空"
                return content, None
            # NOTE: urlopen raises HTTPError for 4xx/5xx, so this branch is
            # only reached for unusual non-200 statuses that urllib passes
            # through; kept as a defensive fallback.
            return None, f"HTTP {response.status}"
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"
def download_html(url: str) -> Tuple[Optional[str], list]:
    """Unified HTML download entry point; tries each downloader in priority order.

    Returns (content, failures):
    - content: the page HTML on success, None when every downloader failed
    - failures: one formatted failure line per downloader that was tried
    """
    attempts = (
        ("pyppeteer", download_with_pyppeteer),
        ("selenium", download_with_selenium),
        ("httpx", download_with_httpx),
        ("urllib", download_with_urllib),
    )
    failures = []
    for name, downloader in attempts:
        html, error = downloader(url)
        if html is not None:
            return html, failures
        failures.append(f"- {name}: {error}")
    return None, failures