feat: 统一文档解析器项目 - 迁移 lyxy-reader-office 和 lyxy-reader-html

## 功能特性

- 建立统一的项目结构，包含 core/、readers/、utils/、tests/ 模块
- 迁移 lyxy-reader-office 的所有解析器（docx、xlsx、pptx、pdf）
- 迁移 lyxy-reader-html 的所有解析器（html、url 下载）
- 统一 CLI 入口为 lyxy_document_reader.py
- 统一 Markdown 后处理逻辑
- 按文件类型组织 readers，每个解析器独立文件
- 依赖分组按文件类型细分（docx、xlsx、pptx、pdf、html、http）
- PDF OCR 解析器优先，无参数控制
- 使用 logging 模块替代简单 print
- 设计完整的单元测试结构
- 重写项目文档

## 新增目录/文件

- core/ - 核心模块（异常体系、Markdown 工具、解析调度器）
- readers/ - 格式阅读器（base.py + docx/xlsx/pptx/pdf/html）
- utils/ - 工具函数（文件类型检测）
- tests/ - 测试（conftest.py + test_core/ + test_readers/ + test_utils/）
- lyxy_document_reader.py - 统一 CLI 入口

## 依赖分组

- docx - DOCX 文档解析支持
- xlsx - XLSX 文档解析支持
- pptx - PPTX 文档解析支持
- pdf - PDF 文档解析支持（含 OCR）
- html - HTML/URL 解析支持
- http - HTTP/URL 下载支持
- office - Office 格式组合（docx/xlsx/pptx/pdf）
- web - Web 格式组合（html/http）
- full - 完整功能
- dev - 开发依赖
2026-03-08 13:46:37 +08:00
parent eb8973495e
commit 833018d451
66 changed files with 4054 additions and 0 deletions
--- a/readers/html/__init__.py
+++ b/readers/html/__init__.py
@@ -0,0 +1,89 @@
+"""HTML/URL 文件阅读器，支持多种解析方法。"""
+
+import os
+from typing import List, Optional, Tuple
+
+from readers.base import BaseReader
+from utils import is_html_file, is_url
+
+from . import cleaner
+from . import downloader
+from . import trafilatura
+from . import domscribe
+from . import markitdown
+from . import html2text
+
+
+# Ordered fallback chain of (display name, adapter) pairs. Every adapter is
+# called as adapter(html_text, source_path) and returns a (markdown, error)
+# tuple; only the MarkItDown adapter forwards the path, the others work on
+# the HTML string alone.
+PARSERS = [
+    ("trafilatura", lambda html, path: trafilatura.parse(html)),
+    ("domscribe", lambda html, path: domscribe.parse(html)),
+    ("MarkItDown", lambda html, path: markitdown.parse(html, path)),
+    ("html2text", lambda html, path: html2text.parse(html)),
+]
+
+
+class HtmlReader(BaseReader):
+    """Reader for HTML input given either as a URL or as a local file.
+
+    The HTML is cleaned first and then offered to each entry of ``PARSERS``
+    in order until one of them produces Markdown.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Return the file extensions handled by this reader."""
+        return [".html", ".htm"]
+
+    def supports(self, file_path: str) -> bool:
+        """Return True when *file_path* is a URL or an HTML file."""
+        return is_url(file_path) or is_html_file(file_path)
+
+    def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]:
+        """Download *url*, clean the HTML and convert it to Markdown.
+
+        Returns:
+            ``(content, failures)``. ``content`` is ``None`` when the download
+            or every parser failed; ``failures`` lists the download failures
+            followed by the parser failures.
+        """
+        html_content, download_failures = downloader.download_html(url)
+        failures = list(download_failures)
+        if html_content is None:
+            return None, failures
+
+        content, parse_failures = self._clean_and_parse(html_content)
+        failures.extend(parse_failures)
+        return content, failures
+
+    def _clean_and_parse(self, html_content: str) -> Tuple[Optional[str], List[str]]:
+        """Clean raw HTML, then run the cleaned text through the parser chain."""
+        cleaned = cleaner.clean_html_content(html_content)
+        # No source path is forwarded: a path-based parser would read the
+        # original, uncleaned file from disk and bypass the cleaning step.
+        return self._parse_html_content(cleaned, None)
+
+    def _parse_html_content(self, html_content: str, temp_file_path: Optional[str]) -> Tuple[Optional[str], List[str]]:
+        """Try each parser in ``PARSERS`` order and return the first result.
+
+        Args:
+            html_content: HTML text handed to every parser.
+            temp_file_path: Optional path of a file holding the same HTML;
+                passed through to parsers that accept a path.
+
+        Returns:
+            ``(content, failures)`` where ``failures`` holds one
+            ``"- name: error"`` entry per parser that was tried and failed.
+        """
+        failures = []
+        for parser_name, parser_func in PARSERS:
+            content, error = parser_func(html_content, temp_file_path)
+            if content is not None:
+                return content, failures
+            failures.append(f"- {parser_name}: {error}")
+        return None, failures
+
+    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
+        """Convert a URL or a local HTML file to Markdown.
+
+        Returns:
+            ``(content, failures)``; ``content`` is ``None`` when nothing
+            could be produced and ``failures`` explains each failed step.
+        """
+        if is_url(file_path):
+            return self.download_and_parse(file_path)
+
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        except Exception as e:
+            # Any read problem is reported as a failure entry, not raised.
+            return None, [f"- 读取文件失败: {str(e)}"]
+
+        return self._clean_and_parse(html_content)
--- a/readers/html/cleaner.py
+++ b/readers/html/cleaner.py
@@ -0,0 +1,69 @@
+"""HTML 清理模块，用于清理 HTML 内容中的敏感信息。"""
+
+import re
+from bs4 import BeautifulSoup
+
+
+def clean_html_content(html_content: str) -> str:
+    """Strip non-content markup and URL-bearing attributes from HTML.
+
+    Drops every ``script``, ``style``, ``svg`` and ``link`` element, removes
+    the ``href``/``src``/``srcset``/``action``/``style``/``data-href``
+    attributes plus any ``data-*`` attribute whose name contains ``src``,
+    erases ``http(s)://`` URLs inside ``title`` attributes and filters
+    URL-like entries out of ``class``.
+
+    Args:
+        html_content: HTML markup to clean.
+
+    Returns:
+        The cleaned document serialized back to a string.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Whole elements that are removed together with their content.
+    for element_name in ("script", "style", "svg", "link"):
+        for element in soup.find_all(element_name):
+            element.decompose()
+
+    # The attribute rules only look at one tag at a time, so a single walk
+    # over the tree applies all of them.
+    for node in soup.find_all(True):
+        for name in ("href", "src", "srcset", "action"):
+            if name in node.attrs:
+                del node[name]
+
+        # Collect first, delete afterwards: the attribute dict must not
+        # change size while it is being iterated.
+        source_like = [
+            name
+            for name in node.attrs
+            if name.startswith("data-") and "src" in name.lower()
+        ]
+        for name in source_like:
+            del node[name]
+
+        for name in ("style", "data-href"):
+            if name in node.attrs:
+                del node[name]
+
+        if "title" in node.attrs:
+            node["title"] = re.sub(
+                r"https?://\S+", "", node["title"], flags=re.IGNORECASE
+            )
+
+        if "class" in node.attrs:
+            node["class"] = [
+                token
+                for token in node["class"]
+                if not (token.startswith("url ") or "hyperlink-href:" in token)
+            ]
+
+    return str(soup)
--- a/readers/html/domscribe.py
+++ b/readers/html/domscribe.py
@@ -0,0 +1,22 @@
+"""使用 domscribe 解析 HTML"""
+
+from typing import Optional, Tuple
+
+
+def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert HTML to Markdown with domscribe.
+
+    Returns:
+        ``(markdown, None)`` on success, otherwise ``(None, reason)`` when the
+        library is missing, the result is blank, or the conversion raised.
+    """
+    try:
+        from domscribe import html_to_markdown
+    except ImportError:
+        return None, "domscribe 库未安装"
+
+    try:
+        rendered = html_to_markdown(html_content, {"extract_main_content": True})
+        if rendered.strip():
+            return rendered, None
+        return None, "解析内容为空"
+    except Exception as exc:
+        return None, f"domscribe 解析失败: {str(exc)}"
--- a/readers/html/downloader.py
+++ b/readers/html/downloader.py
@@ -0,0 +1,262 @@
+"""URL 下载模块，按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
+
+import os
+import asyncio
+import tempfile
+import urllib.request
+import urllib.error
+from typing import Optional, Tuple
+
+
+# Shared settings
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+WINDOW_SIZE = "1920,1080"
+LANGUAGE_SETTING = "zh-CN,zh"
+
+# Chrome launch arguments (shared by pyppeteer and selenium)
+CHROME_ARGS = [
+    "--no-sandbox",
+    "--disable-dev-shm-usage",
+    "--disable-gpu",
+    "--disable-software-rasterizer",
+    "--disable-extensions",
+    "--disable-background-networking",
+    "--disable-default-apps",
+    "--disable-sync",
+    "--disable-translate",
+    "--hide-scrollbars",
+    "--metrics-recording-only",
+    "--mute-audio",
+    "--no-first-run",
+    "--safebrowsing-disable-auto-update",
+    "--blink-settings=imagesEnabled=false",
+    "--disable-plugins",
+    "--disable-ipc-flooding-protection",
+    "--disable-renderer-backgrounding",
+    "--disable-background-timer-throttling",
+    "--disable-hang-monitor",
+    "--disable-prompt-on-repost",
+    "--disable-client-side-phishing-detection",
+    "--disable-component-update",
+    "--disable-domain-reliability",
+    # One comma-separated switch: when a switch is repeated Chromium keeps
+    # only its last value, so separate --disable-features flags would leave
+    # just the final feature disabled.
+    "--disable-features=site-per-process,IsolateOrigins,VizDisplayCompositor,WebRTC",
+    f"--window-size={WINDOW_SIZE}",
+    f"--lang={LANGUAGE_SETTING}",
+    f"--user-agent={USER_AGENT}",
+]
+
+# JavaScript that masks common automation fingerprints (navigator.webdriver,
+# navigator.plugins, navigator.languages). The text is an arrow-function
+# expression, not a statement sequence: it only takes effect once something
+# calls the function.
+HIDE_AUTOMATION_SCRIPT = """
+    () => {
+        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
+        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
+    }
+"""
+
+# Variant for pyppeteer: same overrides plus a patched
+# navigator.permissions.query that answers 'notifications' queries with
+# Notification.permission and delegates every other query to the original.
+HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
+    () => {
+        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
+        Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
+        const originalQuery = window.navigator.permissions.query;
+        window.navigator.permissions.query = (parameters) => (
+            parameters.name === 'notifications' ?
+                Promise.resolve({ state: Notification.permission }) :
+                originalQuery(parameters)
+        );
+    }
+"""
+
+
+def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """Download *url* with pyppeteer (headless Chromium, JavaScript rendered).
+
+    When ``LYXY_CHROMIUM_BINARY`` points at an existing file that browser is
+    launched; otherwise pyppeteer falls back to its own Chromium. When the
+    variable is unset, ``PYPPETEER_HOME`` is pointed at a directory under the
+    system temp dir.
+
+    Returns:
+        ``(html, None)`` on success, otherwise ``(None, reason)``.
+    """
+    chromium_path = os.environ.get("LYXY_CHROMIUM_BINARY")
+    if not chromium_path:
+        # pyppeteer resolves PYPPETEER_HOME while the package is being
+        # imported, so the variable has to be in place before the import
+        # below; setting it afterwards has no effect on a first import.
+        os.environ["PYPPETEER_HOME"] = os.path.join(
+            tempfile.gettempdir(), "pyppeteer_home"
+        )
+
+    try:
+        from pyppeteer import launch
+    except ImportError:
+        return None, "pyppeteer 库未安装"
+
+    # A configured binary that does not exist is ignored, not passed on.
+    executable_path = chromium_path if (chromium_path and os.path.exists(chromium_path)) else None
+
+    async def _download():
+        browser = None
+        try:
+            browser = await launch(
+                headless=True,
+                executablePath=executable_path,
+                args=CHROME_ARGS
+            )
+            page = await browser.newPage()
+
+            # Registered before navigation so it runs in every new document.
+            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
+
+            await page.setJavaScriptEnabled(True)
+            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
+            return await page.content()
+        finally:
+            if browser is not None:
+                try:
+                    await browser.close()
+                except Exception:
+                    # Best-effort shutdown: a close failure must not mask
+                    # the download result or the original error.
+                    pass
+
+    try:
+        content = asyncio.run(_download())
+        if not content or not content.strip():
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"pyppeteer 下载失败: {str(e)}"
+
+
+def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """Download *url* with selenium-driven headless Chrome (JavaScript rendered).
+
+    Requires ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY`` to point at
+    existing files.
+
+    Returns:
+        ``(html, None)`` on success, otherwise ``(None, reason)``.
+    """
+    try:
+        from selenium import webdriver
+        from selenium.webdriver.chrome.service import Service
+        from selenium.webdriver.chrome.options import Options
+        from selenium.webdriver.support.ui import WebDriverWait
+    except ImportError:
+        return None, "selenium 库未安装"
+
+    import time
+
+    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
+    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
+
+    if not driver_path or not os.path.exists(driver_path):
+        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
+    if not binary_path or not os.path.exists(binary_path):
+        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"
+
+    chrome_options = Options()
+    chrome_options.binary_location = binary_path
+    chrome_options.add_argument("--headless=new")
+    for arg in CHROME_ARGS:
+        chrome_options.add_argument(arg)
+
+    # Hide automation markers
+    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    chrome_options.add_experimental_option("useAutomationExtension", False)
+
+    driver = None
+    try:
+        service = Service(driver_path)
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+
+        # The CDP command evaluates "source" as a script. The shared snippet
+        # is an arrow-function expression, so it is wrapped and invoked here;
+        # passed as-is it would be evaluated without ever being called.
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+            "source": f"({HIDE_AUTOMATION_SCRIPT})();"
+        })
+
+        driver.get(url)
+
+        WebDriverWait(driver, 30).until(
+            lambda d: d.execute_script("return document.readyState") == "complete"
+        )
+
+        # Poll every 0.5 s (at most 30 times) until the page source length
+        # has been unchanged on two consecutive checks.
+        last_len = 0
+        stable_count = 0
+        for _ in range(30):
+            current_len = len(driver.page_source)
+            if current_len == last_len:
+                stable_count += 1
+                if stable_count >= 2:
+                    break
+            else:
+                stable_count = 0
+                last_len = current_len
+            time.sleep(0.5)
+
+        content = driver.page_source
+        if not content or not content.strip():
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"selenium 下载失败: {str(e)}"
+    finally:
+        if driver is not None:
+            try:
+                driver.quit()
+            except Exception:
+                # Best-effort shutdown; the result above is already decided.
+                pass
+
+
+def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """Download *url* with httpx (plain HTTP client, no JavaScript).
+
+    Returns:
+        ``(html, None)`` for a non-blank HTTP 200 response, otherwise
+        ``(None, reason)``.
+    """
+    try:
+        import httpx
+    except ImportError:
+        return None, "httpx 库未安装"
+
+    headers = {
+        "User-Agent": USER_AGENT
+    }
+
+    try:
+        # httpx does not follow redirects unless asked to; without this any
+        # 301/302 (http -> https, trailing slash, ...) is reported as failure.
+        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
+            response = client.get(url, headers=headers)
+            if response.status_code != 200:
+                return None, f"HTTP {response.status_code}"
+            content = response.text
+            if not content or not content.strip():
+                return None, "下载内容为空"
+            return content, None
+    except Exception as e:
+        return None, f"httpx 下载失败: {str(e)}"
+
+
+def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """Download *url* with urllib (standard library, last-resort fallback).
+
+    The body is decoded with the charset declared in the ``Content-Type``
+    response header, defaulting to UTF-8. If that charset is unknown or the
+    bytes do not match it, UTF-8 with replacement characters is used so a
+    mislabelled page still yields text.
+
+    Returns:
+        ``(html, None)`` for a non-blank HTTP 200 response, otherwise
+        ``(None, reason)``.
+    """
+    headers = {
+        "User-Agent": USER_AGENT
+    }
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as response:
+            if response.status != 200:
+                return None, f"HTTP {response.status}"
+            raw = response.read()
+            charset = response.headers.get_content_charset() or "utf-8"
+
+        try:
+            content = raw.decode(charset)
+        except (LookupError, UnicodeDecodeError):
+            content = raw.decode("utf-8", errors="replace")
+
+        if not content or not content.strip():
+            return None, "下载内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"urllib 下载失败: {str(e)}"
+
+
+def download_html(url: str) -> Tuple[Optional[str], list]:
+    """Fetch the HTML behind *url*, trying each downloader in priority order.
+
+    Order: pyppeteer, selenium, httpx, urllib. The first downloader that
+    returns content wins.
+
+    Returns:
+        ``(content, failures)``: ``content`` is the HTML or ``None`` when
+        every downloader failed; ``failures`` holds one ``"- name: reason"``
+        entry per downloader that was tried without success.
+    """
+    attempts = (
+        ("pyppeteer", download_with_pyppeteer),
+        ("selenium", download_with_selenium),
+        ("httpx", download_with_httpx),
+        ("urllib", download_with_urllib),
+    )
+
+    failures = []
+    for label, fetch in attempts:
+        html, reason = fetch(url)
+        if html is not None:
+            return html, failures
+        failures.append(f"- {label}: {reason}")
+
+    return None, failures
--- a/readers/html/html2text.py
+++ b/readers/html/html2text.py
@@ -0,0 +1,25 @@
+"""使用 html2text 解析 HTML（兜底方案）"""
+
+from typing import Optional, Tuple
+
+
+def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert HTML to Markdown with html2text (last parser in the chain).
+
+    Returns:
+        ``(markdown, None)`` on success, otherwise ``(None, reason)`` when the
+        library is missing, the result is blank, or the conversion raised.
+    """
+    try:
+        import html2text
+    except ImportError:
+        return None, "html2text 库未安装"
+
+    try:
+        converter = html2text.HTML2Text()
+        settings = {
+            "ignore_emphasis": False,
+            "ignore_links": False,
+            "ignore_images": True,
+            "body_width": 0,
+            "skip_internal_links": True,
+        }
+        for option, value in settings.items():
+            setattr(converter, option, value)
+
+        rendered = converter.handle(html_content)
+        if rendered.strip():
+            return rendered, None
+        return None, "解析内容为空"
+    except Exception as exc:
+        return None, f"html2text 解析失败: {str(exc)}"
--- a/readers/html/markitdown.py
+++ b/readers/html/markitdown.py
@@ -0,0 +1,41 @@
+"""使用 MarkItDown 解析 HTML"""
+
+import os
+import tempfile
+from typing import Optional, Tuple
+
+
+def parse(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
+    """Convert HTML to Markdown with MarkItDown.
+
+    MarkItDown is given a file path. When *temp_file_path* names an existing
+    file it is used as-is; otherwise *html_content* is written to a temporary
+    ``.html`` file that is always removed again, whether the conversion
+    succeeds or raises.
+
+    Args:
+        html_content: HTML text, used when no usable path is supplied.
+        temp_file_path: Optional path of an existing HTML file to convert.
+
+    Returns:
+        ``(markdown, None)`` on success, otherwise ``(None, reason)``.
+    """
+    try:
+        from markitdown import MarkItDown
+    except ImportError:
+        return None, "MarkItDown 库未安装"
+
+    # Set only when this call created the file, so cleanup never touches a
+    # caller-supplied path and never skips a file created here.
+    scratch_path = None
+    try:
+        input_path = temp_file_path
+        if not input_path or not os.path.exists(input_path):
+            fd, scratch_path = tempfile.mkstemp(suffix='.html')
+            with os.fdopen(fd, 'w', encoding='utf-8') as f:
+                f.write(html_content)
+            input_path = scratch_path
+
+        md = MarkItDown()
+        result = md.convert(
+            input_path,
+            heading_style="ATX",
+            strip=["img", "script", "style", "noscript"],
+        )
+        markdown_content = result.text_content
+
+        if not markdown_content.strip():
+            return None, "解析内容为空"
+        return markdown_content, None
+    except Exception as e:
+        return None, f"MarkItDown 解析失败: {str(e)}"
+    finally:
+        if scratch_path is not None:
+            try:
+                os.unlink(scratch_path)
+            except OSError:
+                # Best-effort cleanup; a leftover temp file must not change
+                # the parse result.
+                pass
--- a/readers/html/trafilatura.py
+++ b/readers/html/trafilatura.py
@@ -0,0 +1,30 @@
+"""使用 trafilatura 解析 HTML"""
+
+from typing import Optional, Tuple
+
+
+def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert HTML to Markdown with trafilatura.
+
+    Returns:
+        ``(markdown, None)`` on success, otherwise ``(None, reason)`` when the
+        library is missing, nothing was extracted, the result is blank, or
+        the extraction raised.
+    """
+    try:
+        import trafilatura
+    except ImportError:
+        return None, "trafilatura 库未安装"
+
+    extract_options = {
+        "output_format": "markdown",
+        "include_formatting": True,
+        "include_links": True,
+        "include_images": False,
+        "include_tables": True,
+        "favor_recall": True,
+        "include_comments": True,
+    }
+
+    try:
+        extracted = trafilatura.extract(html_content, **extract_options)
+        if extracted is None:
+            return None, "trafilatura 返回 None"
+        if extracted.strip():
+            return extracted, None
+        return None, "解析内容为空"
+    except Exception as exc:
+        return None, f"trafilatura 解析失败: {str(exc)}"