feat: 统一文档解析器项目 - 迁移 lyxy-reader-office 和 lyxy-reader-html

## 功能特性

- 建立统一的项目结构,包含 core/、readers/、utils/、tests/ 模块
- 迁移 lyxy-reader-office 的所有解析器(docx、xlsx、pptx、pdf)
- 迁移 lyxy-reader-html 的所有解析器(html、url 下载)
- 统一 CLI 入口为 lyxy_document_reader.py
- 统一 Markdown 后处理逻辑
- 按文件类型组织 readers,每个解析器独立文件
- 依赖分组按文件类型细分(docx、xlsx、pptx、pdf、html、http)
- PDF OCR 解析器优先,无参数控制
- 使用 logging 模块替代简单 print
- 设计完整的单元测试结构
- 重写项目文档

## 新增目录/文件

- core/ - 核心模块(异常体系、Markdown 工具、解析调度器)
- readers/ - 格式阅读器(base.py + docx/xlsx/pptx/pdf/html)
- utils/ - 工具函数(文件类型检测)
- tests/ - 测试(conftest.py + test_core/ + test_readers/ + test_utils/)
- lyxy_document_reader.py - 统一 CLI 入口

## 依赖分组

- docx - DOCX 文档解析支持
- xlsx - XLSX 文档解析支持
- pptx - PPTX 文档解析支持
- pdf - PDF 文档解析支持(含 OCR)
- html - HTML/URL 解析支持
- http - HTTP/URL 下载支持
- office - Office 格式组合(docx/xlsx/pptx/pdf)
- web - Web 格式组合(html/http)
- full - 完整功能
- dev - 开发依赖
This commit is contained in:
2026-03-08 13:46:37 +08:00
parent eb8973495e
commit 833018d451
66 changed files with 4054 additions and 0 deletions

89
readers/html/__init__.py Normal file
View File

@@ -0,0 +1,89 @@
"""HTML/URL 文件阅读器,支持多种解析方法。"""
import os
from typing import List, Optional, Tuple
from readers.base import BaseReader
from utils import is_html_file, is_url
from . import cleaner
from . import downloader
from . import trafilatura
from . import domscribe
from . import markitdown
from . import html2text
# Ordered fallback chain of HTML -> Markdown converters. Each entry pairs a
# display name with a callable taking (html, source_path) and returning the
# converter's (markdown, error) tuple. Only the MarkItDown adapter receives
# the source path; the others work on the HTML string alone.
PARSERS = [
    ("trafilatura", lambda html, source_path: trafilatura.parse(html)),
    ("domscribe", lambda html, source_path: domscribe.parse(html)),
    ("MarkItDown", lambda html, source_path: markitdown.parse(html, source_path)),
    ("html2text", lambda html, source_path: html2text.parse(html)),
]
class HtmlReader(BaseReader):
    """Reader for local HTML files and remote URLs.

    HTML is cleaned first and then offered to each converter in ``PARSERS``
    in order until one of them produces Markdown.
    """

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this reader."""
        return [".html", ".htm"]

    def supports(self, file_path: str) -> bool:
        """Tell whether *file_path* is a URL or an HTML file."""
        return is_url(file_path) or is_html_file(file_path)

    def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]:
        """Download *url*, clean the HTML and convert it to Markdown.

        Returns:
            ``(markdown, failures)``. ``markdown`` is None when the download
            or every converter failed; ``failures`` holds the failure
            messages gathered from the download and conversion steps.
        """
        raw_html, download_failures = downloader.download_html(url)
        failures = list(download_failures)
        if raw_html is None:
            return None, failures

        cleaned_html = cleaner.clean_html_content(raw_html)
        # Downloaded pages have no file on disk, so no path is forwarded.
        markdown, parse_failures = self._parse_html_content(cleaned_html, None)
        failures.extend(parse_failures)
        return markdown, failures

    def _parse_html_content(self, html_content: str, temp_file_path: Optional[str]) -> Tuple[Optional[str], List[str]]:
        """Run the converters in ``PARSERS`` until one succeeds.

        Args:
            html_content: HTML markup to convert.
            temp_file_path: Optional path handed to every converter callable.

        Returns:
            ``(markdown, failures)``; ``failures`` lists one
            ``"- name: error"`` line per converter that was tried and failed
            before the result was produced.
        """
        failures = []
        for label, convert in PARSERS:
            markdown, reason = convert(html_content, temp_file_path)
            if markdown is not None:
                return markdown, failures
            failures.append(f"- {label}: {reason}")
        return None, failures

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Convert a URL or a local HTML file to Markdown.

        Returns:
            ``(markdown, failures)`` with the same meaning as in
            ``download_and_parse``.
        """
        if is_url(file_path):
            return self.download_and_parse(file_path)

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_html = handle.read()
        except Exception as exc:
            return None, [f"- 读取文件失败: {str(exc)}"]

        cleaned_html = cleaner.clean_html_content(raw_html)
        # The original path is forwarded alongside the cleaned markup.
        # NOTE(review): a converter that reads from the path would see the
        # uncleaned file rather than ``cleaned_html`` - confirm this is
        # intended.
        markdown, parse_failures = self._parse_html_content(cleaned_html, file_path)
        return markdown, list(parse_failures)

69
readers/html/cleaner.py Normal file
View File

@@ -0,0 +1,69 @@
"""HTML 清理模块,用于清理 HTML 内容中的敏感信息。"""
import re
from bs4 import BeautifulSoup
def clean_html_content(html_content: str) -> str:
    """Strip non-content markup and URL-bearing attributes from HTML.

    ``script``, ``style``, ``svg`` and ``link`` elements are removed. Every
    remaining tag then loses its ``href``, ``src``, ``srcset``, ``action``,
    ``style`` and ``data-href`` attributes plus any ``data-*`` attribute whose
    name contains ``src``; http(s) URLs are stripped from ``title`` values and
    URL-like entries are filtered out of ``class``.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The cleaned document serialized back to a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove whole elements, one tag name at a time in the original order.
    for tag_name in ("script", "style", "svg", "link"):
        for node in soup.find_all(tag_name):
            node.decompose()

    # A single traversal applies every attribute rule. The rules touch
    # disjoint attributes and never alter the tree, so the result matches
    # running them as separate passes while walking the document only once.
    for tag in soup.find_all(True):
        for attr in ("href", "src", "srcset", "action", "style", "data-href"):
            if attr in tag.attrs:
                del tag[attr]

        # Collect first: attributes cannot be deleted while iterating attrs.
        data_src_attrs = [
            attr
            for attr in tag.attrs
            if attr.startswith("data-") and "src" in attr.lower()
        ]
        for attr in data_src_attrs:
            del tag[attr]

        if "title" in tag.attrs:
            tag["title"] = re.sub(
                r"https?://\S+", "", tag["title"], flags=re.IGNORECASE
            )

        if "class" in tag.attrs:
            # NOTE(review): class values are presumably already split on
            # whitespace, in which case the "url " prefix test can never
            # match - confirm before relying on it.
            tag["class"] = [
                c
                for c in tag["class"]
                if not c.startswith("url ") and "hyperlink-href:" not in c
            ]

    return str(soup)

22
readers/html/domscribe.py Normal file
View File

@@ -0,0 +1,22 @@
"""使用 domscribe 解析 HTML"""
from typing import Optional, Tuple
def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with the optional ``domscribe`` package.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, reason)`` where
        ``reason`` describes the missing package, empty output or error.
    """
    try:
        from domscribe import html_to_markdown
    except ImportError:
        return None, "domscribe 库未安装"

    try:
        markdown = html_to_markdown(html_content, {'extract_main_content': True})
        if markdown.strip():
            return markdown, None
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"domscribe 解析失败: {str(exc)}"

262
readers/html/downloader.py Normal file
View File

@@ -0,0 +1,262 @@
"""URL 下载模块,按 pyppeteer → selenium → httpx → urllib 优先级尝试下载。"""
import os
import asyncio
import tempfile
import urllib.request
import urllib.error
from typing import Optional, Tuple
# Settings shared by the downloaders in this module.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
WINDOW_SIZE = "1920,1080"
LANGUAGE_SETTING = "zh-CN,zh"

# Chrome launch flags shared by the pyppeteer and selenium downloaders.
CHROME_ARGS = [
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--disable-background-networking",
    "--disable-default-apps",
    "--disable-sync",
    "--disable-translate",
    "--hide-scrollbars",
    "--metrics-recording-only",
    "--mute-audio",
    "--no-first-run",
    "--safebrowsing-disable-auto-update",
    "--blink-settings=imagesEnabled=false",
    "--disable-plugins",
    "--disable-ipc-flooding-protection",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-hang-monitor",
    "--disable-prompt-on-repost",
    "--disable-client-side-phishing-detection",
    "--disable-component-update",
    "--disable-domain-reliability",
    # Chromium keeps only the last value of a repeated switch, so every
    # feature to disable has to be listed in one comma-separated switch.
    "--disable-features=site-per-process,IsolateOrigins,VizDisplayCompositor,WebRTC",
    f"--window-size={WINDOW_SIZE}",
    f"--lang={LANGUAGE_SETTING}",
    f"--user-agent={USER_AGENT}",
]
# JavaScript arrow function that masks common automation fingerprints by
# redefining navigator.webdriver, navigator.plugins and navigator.languages.
# In this module it is handed to the selenium downloader's
# Page.addScriptToEvaluateOnNewDocument CDP call.
HIDE_AUTOMATION_SCRIPT = """
() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
}
"""
# Variant handed to pyppeteer's evaluateOnNewDocument: the same three
# navigator patches plus a permissions.query override that answers
# 'notifications' queries with Notification.permission and delegates every
# other query to the original implementation.
HIDE_AUTOMATION_SCRIPT_PUPPETEER = """
() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
}
"""
def download_with_pyppeteer(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with headless Chromium driven by pyppeteer (JS rendered).

    The browser binary is taken from ``LYXY_CHROMIUM_BINARY`` when that file
    exists; otherwise no executable path is passed to pyppeteer.

    Returns:
        ``(html, None)`` on success, otherwise ``(None, reason)``.
    """
    try:
        from pyppeteer import launch
    except ImportError:
        return None, "pyppeteer 库未安装"

    async def _fetch_page():
        home_dir = os.path.join(tempfile.gettempdir(), "pyppeteer_home")
        configured_binary = os.environ.get("LYXY_CHROMIUM_BINARY")
        if not configured_binary:
            # NOTE(review): pyppeteer is already imported at this point;
            # confirm it still honours PYPPETEER_HOME when set this late.
            os.environ["PYPPETEER_HOME"] = home_dir

        if configured_binary and os.path.exists(configured_binary):
            binary = configured_binary
        else:
            binary = None

        browser = None
        try:
            browser = await launch(headless=True, executablePath=binary, args=CHROME_ARGS)
            page = await browser.newPage()
            await page.evaluateOnNewDocument(HIDE_AUTOMATION_SCRIPT_PUPPETEER)
            await page.setJavaScriptEnabled(True)
            await page.goto(url, {"waitUntil": "networkidle2", "timeout": 30000})
            return await page.content()
        finally:
            # Best-effort shutdown: a failing close must not mask the result.
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    pass

    try:
        html = asyncio.run(_fetch_page())
        if html and html.strip():
            return html, None
        return None, "下载内容为空"
    except Exception as exc:
        return None, f"pyppeteer 下载失败: {str(exc)}"
def download_with_selenium(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with headless Chrome driven by selenium (JS rendered).

    Requires the ``LYXY_CHROMIUM_DRIVER`` and ``LYXY_CHROMIUM_BINARY``
    environment variables to point at existing files.

    Returns:
        ``(html, None)`` on success, otherwise ``(None, reason)``.
    """
    import time  # only needed by the polling loop below

    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.support.ui import WebDriverWait
    except ImportError:
        return None, "selenium 库未安装"

    driver_path = os.environ.get("LYXY_CHROMIUM_DRIVER")
    binary_path = os.environ.get("LYXY_CHROMIUM_BINARY")
    if not driver_path or not os.path.exists(driver_path):
        return None, "LYXY_CHROMIUM_DRIVER 环境变量未设置或文件不存在"
    if not binary_path or not os.path.exists(binary_path):
        return None, "LYXY_CHROMIUM_BINARY 环境变量未设置或文件不存在"

    chrome_options = Options()
    chrome_options.binary_location = binary_path
    chrome_options.add_argument("--headless=new")
    for arg in CHROME_ARGS:
        chrome_options.add_argument(arg)
    # Hide the switches that advertise an automated browser.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        # The CDP command evaluates "source" as a plain script.
        # HIDE_AUTOMATION_SCRIPT is an arrow-function expression, so it must
        # be wrapped and invoked; passed bare, it is created and discarded
        # without ever patching navigator.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": f"({HIDE_AUTOMATION_SCRIPT})()"
        })
        driver.get(url)

        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Poll the page size every 0.5 s (at most 30 times) and stop once it
        # has been unchanged on two consecutive checks.
        last_len = 0
        stable_count = 0
        for _ in range(30):
            current_len = len(driver.page_source)
            if current_len == last_len:
                stable_count += 1
                if stable_count >= 2:
                    break
            else:
                stable_count = 0
                last_len = current_len
            time.sleep(0.5)

        content = driver.page_source
        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"selenium 下载失败: {str(e)}"
    finally:
        # Best-effort shutdown: a failing quit must not mask the result.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def download_with_httpx(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with the lightweight ``httpx`` HTTP client.

    Returns:
        ``(html, None)`` on success, otherwise ``(None, reason)``. A final
        status other than 200 is reported as ``"HTTP <code>"``.
    """
    try:
        import httpx
    except ImportError:
        return None, "httpx 库未安装"

    headers = {
        "User-Agent": USER_AGENT
    }
    try:
        # httpx does not follow redirects unless asked to; without this a
        # plain 301/302 hop would be reported as a failed download.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url, headers=headers)
            if response.status_code != 200:
                return None, f"HTTP {response.status_code}"
            content = response.text
            if not content or not content.strip():
                return None, "下载内容为空"
            return content, None
    except Exception as e:
        return None, f"httpx 下载失败: {str(e)}"
def download_with_urllib(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with the standard library as the last-resort downloader.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8; undecodable bytes are
    replaced instead of failing the whole download.

    Returns:
        ``(html, None)`` on success, otherwise ``(None, reason)``.
    """
    headers = {
        "User-Agent": USER_AGENT
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status != 200:
                return None, f"HTTP {response.status}"
            raw = response.read()
            charset = response.headers.get_content_charset() or "utf-8"

        try:
            content = raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared an encoding name Python does not know.
            content = raw.decode("utf-8", errors="replace")

        if not content or not content.strip():
            return None, "下载内容为空"
        return content, None
    except Exception as e:
        return None, f"urllib 下载失败: {str(e)}"
def download_html(url: str) -> Tuple[Optional[str], list]:
    """Download *url* by trying each downloader in priority order.

    The order is pyppeteer, selenium, httpx, urllib; the first downloader
    that yields content wins.

    Returns:
        ``(content, failures)``. ``content`` is the HTML on success and None
        when every downloader failed; ``failures`` holds one
        ``"- name: error"`` line per downloader that failed before that.
    """
    attempts = (
        ("pyppeteer", download_with_pyppeteer),
        ("selenium", download_with_selenium),
        ("httpx", download_with_httpx),
        ("urllib", download_with_urllib),
    )
    failures = []
    for label, fetch in attempts:
        html, reason = fetch(url)
        if html is not None:
            return html, failures
        failures.append(f"- {label}: {reason}")
    return None, failures

25
readers/html/html2text.py Normal file
View File

@@ -0,0 +1,25 @@
"""使用 html2text 解析 HTML兜底方案"""
from typing import Optional, Tuple
def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with ``html2text`` (fallback converter).

    Emphasis and links are kept, images are dropped, internal links are
    skipped and line wrapping is disabled.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, reason)``.
    """
    try:
        import html2text
    except ImportError:
        return None, "html2text 库未安装"

    settings = (
        ("ignore_emphasis", False),
        ("ignore_links", False),
        ("ignore_images", True),
        ("body_width", 0),
        ("skip_internal_links", True),
    )
    try:
        converter = html2text.HTML2Text()
        for option, value in settings:
            setattr(converter, option, value)
        markdown = converter.handle(html_content)
        if markdown.strip():
            return markdown, None
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"html2text 解析失败: {str(exc)}"

View File

@@ -0,0 +1,41 @@
"""使用 MarkItDown 解析 HTML"""
import os
import tempfile
from typing import Optional, Tuple
def parse(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Convert HTML to Markdown with the optional ``markitdown`` package.

    Args:
        html_content: HTML markup; written to a temporary file when no usable
            path is supplied.
        temp_file_path: Path of an existing HTML file to convert instead. It
            is ignored when empty or when the file does not exist.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, reason)``.
    """
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    input_path = temp_file_path
    owns_temp_file = False
    try:
        if not input_path or not os.path.exists(input_path):
            fd, input_path = tempfile.mkstemp(suffix='.html')
            # Record ownership right away so the file is removed even when
            # writing or converting fails.
            owns_temp_file = True
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)

        md = MarkItDown()
        result = md.convert(
            input_path,
            heading_style="ATX",
            strip=["img", "script", "style", "noscript"],
        )
        markdown_content = result.text_content
        if not markdown_content.strip():
            return None, "解析内容为空"
        return markdown_content, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
    finally:
        # Delete only the file created here, never a caller-supplied one.
        if owns_temp_file:
            try:
                os.unlink(input_path)
            except OSError:
                pass

View File

@@ -0,0 +1,30 @@
"""使用 trafilatura 解析 HTML"""
from typing import Optional, Tuple
def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract the main content of an HTML page as Markdown via trafilatura.

    Formatting, links, tables and comments are included, images are
    excluded, and extraction favours recall.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, reason)``.
    """
    try:
        import trafilatura
    except ImportError:
        return None, "trafilatura 库未安装"

    extract_options = {
        "output_format": "markdown",
        "include_formatting": True,
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "favor_recall": True,
        "include_comments": True,
    }
    try:
        markdown = trafilatura.extract(html_content, **extract_options)
        if markdown is None:
            return None, "trafilatura 返回 None"
        if markdown.strip():
            return markdown, None
        return None, "解析内容为空"
    except Exception as exc:
        return None, f"trafilatura 解析失败: {str(exc)}"