Files
lyxy-document/scripts/readers/html/__init__.py
lanyuanxiaoyao b80c635f07 refactor: 完善降级链的异常捕获机制
为所有 Reader 的 parser 循环添加 try-except 防护层,确保即使 parser
抛出意外异常,降级链也能继续尝试下一个 parser。

主要变更:
- 所有 Reader (DocxReader/PdfReader/XlsxReader/PptxReader/HtmlReader)
  的 parse 方法中添加防护层,捕获意外异常并标记为 [意外异常]
- cleaner.clean_html_content() 添加异常处理,返回 (content, error) 元组
- HtmlReader.parse() 更新 cleaner 调用方式,处理新的返回值格式
- BaseReader 添加详细的异常处理规范文档

设计原则:双层异常保护
- Parser 层:捕获预期的解析失败(库未安装、格式不支持)
- Reader 层:捕获意外的编程错误(NoneType、索引越界等)
2026-03-09 00:26:51 +08:00

86 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""HTML/URL 文件阅读器,支持多种解析方法。"""
import os
import tempfile
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_url
from scripts.utils import encoding_detection
from . import cleaner
from . import downloader
from . import trafilatura
from . import domscribe
from . import markitdown
from . import html2text
# Degradation chain: (display name, parse function) pairs, tried in order
# by HtmlReader.parse() until one returns non-None content.
PARSERS = [
    ("trafilatura", trafilatura.parse),
    ("domscribe", domscribe.parse),
    ("MarkItDown", markitdown.parse),
    ("html2text", html2text.parse),
]
class HtmlReader(BaseReader):
    """Reader for HTML files and URLs.

    Obtains raw HTML (downloaded for URLs, read from disk for local
    files), cleans it, then walks the PARSERS degradation chain until
    one parser yields content.
    """

    def supports(self, file_path: str) -> bool:
        """Accept any URL, plus local paths ending in .html/.htm."""
        if is_url(file_path):
            return True
        return file_path.lower().endswith(('.html', '.htm'))

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse an HTML file or URL.

        Returns ``(content, failures)`` where ``content`` is the extracted
        text (``None`` if every step failed) and ``failures`` collects one
        message per failed stage/parser.
        """
        failures: List[str] = []

        # Stage 1: obtain the raw HTML.
        if is_url(file_path):
            # URL: download the page.
            html_content, download_failures = downloader.download_html(file_path)
            failures.extend(download_failures)
            if html_content is None:
                return None, failures
        else:
            # Local file: read it with encoding detection.
            if not os.path.exists(file_path):
                return None, ["文件不存在"]
            html_content, error = encoding_detection.read_text_file(file_path)
            if error:
                return None, [f"- {error}"]

        # Stage 2: clean the HTML; a cleaner error is fatal for the whole parse.
        cleaned_html, error = cleaner.clean_html_content(html_content)
        if error:
            failures.append(f"- cleaner: {error}")
            return None, failures
        html_content = cleaned_html

        # Stage 3: each parser gets its own freshly written temp file
        # (independent files per parser, as a parser may consume its input).
        for parser_name, parser_func in PARSERS:
            fd, temp_path = tempfile.mkstemp(suffix='.html', text=True)
            try:
                # Write the cleaned HTML (UTF-8); fdopen takes ownership of fd.
                with os.fdopen(fd, 'w', encoding='utf-8') as handle:
                    handle.write(html_content)
                # Guard layer: an unexpected parser exception must not
                # break the degradation chain — record it and move on.
                try:
                    content, error = parser_func(temp_path)
                except Exception as exc:
                    failures.append(f"- {parser_name}: [意外异常] {type(exc).__name__}: {str(exc)}")
                else:
                    if content is not None:
                        return content, failures
                    failures.append(f"- {parser_name}: {error}")
            finally:
                # Best-effort removal of the temp file.
                try:
                    os.unlink(temp_path)
                except Exception:
                    pass

        # Every parser in the chain failed.
        return None, failures