Patch: make the document readers fail soft with (content, error) tuples.

What this patch does
  * base.py: documents the three-tier error-handling convention
    (parser -> reader -> caller) in the BaseReader docstring.
  * docx/pdf/pptx/xlsx/html readers: wrap each parser call in
    try/except Exception so that an unexpected parser crash is recorded in
    the failure list and the next parser is still tried.
  * html/cleaner.py: clean_html_content() now returns (content, error)
    instead of a bare string, and html/__init__.py consumes that tuple.

Review notes (changes relative to the patch as originally submitted)
  * html/cleaner.py: the module-level "from bs4 import BeautifulSoup" is now
    removed. The function-scope import guarded by "except ImportError" was
    unreachable while the module-level import remained, because a missing
    bs4 would already fail when the cleaner module itself is imported.
    The hunk header is adjusted accordingly (+1,87 -> +1,85).
  * html/cleaner.py: 'not "x" in c' rewritten as '"x" not in c' on the added
    line (same behaviour, idiomatic membership test).
  * The "index" line of html/cleaner.py still carries the post-image hash of
    the original submission; regenerate the patch with git if exact blob ids
    matter (e.g. for three-way apply).
  * NOTE(review): this text was recovered from a whitespace-collapsed copy.
    Leading indentation and line breaks of context lines were reconstructed
    from the hunk line counts; verify with "git apply --check" before use.

diff --git a/scripts/readers/base.py b/scripts/readers/base.py
index 97edcfa..9f82bf4 100644
--- a/scripts/readers/base.py
+++ b/scripts/readers/base.py
@@ -38,5 +38,29 @@ class BaseReader(ABC):
         Returns: (content, failures)
             - content: 成功时返回 Markdown 内容,失败时返回 None
             - failures: 各解析器的失败原因列表
+
+        异常处理规范:
+        -----------------
+        文档读取系统采用三级降级链设计,使用元组返回而非抛出异常:
+
+        1. Parser 层(最底层):
+           - 每个 parser 函数返回 (content, error) 元组
+           - 必须捕获所有预期异常(ImportError, OSError, 解析异常等)
+           - 返回清晰的错误信息,如 "库未安装"、"解析失败: xxx"
+
+        2. Reader 层(中间层):
+           - 遍历多个 parser,收集失败原因
+           - 必须在 parser 循环中添加 try-except 防护层
+           - 捕获意外异常并记录:"[意外异常] ExceptionType: message"
+           - 任一 parser 成功即返回,失败则继续尝试下一个
+
+        3. 调用层(最顶层):
+           - parse_input() 遍历多个 reader
+           - 无 reader 支持时抛出 ReaderNotFoundError
+
+        设计原则:
+        - "失败是预期分支,而非异常情况"
+        - 元组返回优于异常抛出(除顶层外)
+        - 双层异常保护:Parser 层处理预期错误,Reader 层捕获意外异常
         """
         pass
diff --git a/scripts/readers/docx/__init__.py b/scripts/readers/docx/__init__.py
index eec2fe3..be6ab5a 100644
--- a/scripts/readers/docx/__init__.py
+++ b/scripts/readers/docx/__init__.py
@@ -44,10 +44,13 @@ class DocxReader(BaseReader):
         content = None
 
         for parser_name, parser_func in PARSERS:
-            content, error = parser_func(file_path)
-            if content is not None:
-                return content, failures
-            else:
-                failures.append(f"- {parser_name}: {error}")
+            try:
+                content, error = parser_func(file_path)
+                if content is not None:
+                    return content, failures
+                else:
+                    failures.append(f"- {parser_name}: {error}")
+            except Exception as e:
+                failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}")
 
         return None, failures
diff --git a/scripts/readers/html/__init__.py b/scripts/readers/html/__init__.py
index 9bef9e6..fb43d24 100644
--- a/scripts/readers/html/__init__.py
+++ b/scripts/readers/html/__init__.py
@@ -50,7 +50,11 @@ class HtmlReader(BaseReader):
             return None, [f"- {error}"]
 
         # 步骤 2: 清理 HTML 内容
-        html_content = cleaner.clean_html_content(html_content)
+        cleaned_html, error = cleaner.clean_html_content(html_content)
+        if error:
+            all_failures.append(f"- cleaner: {error}")
+            return None, all_failures
+        html_content = cleaned_html
 
         # 步骤 3: 对每个 Parser 创建独立的临时文件并尝试解析
         for parser_name, parser_func in PARSERS:
@@ -61,12 +65,15 @@ class HtmlReader(BaseReader):
                 with os.fdopen(fd, 'w', encoding='utf-8') as f:
                     f.write(html_content)
 
-                # 调用 Parser 解析
-                content, error = parser_func(temp_file_path)
-                if content is not None:
-                    return content, all_failures
-                else:
-                    all_failures.append(f"- {parser_name}: {error}")
+                # 调用 Parser 解析(添加防护层)
+                try:
+                    content, error = parser_func(temp_file_path)
+                    if content is not None:
+                        return content, all_failures
+                    else:
+                        all_failures.append(f"- {parser_name}: {error}")
+                except Exception as e:
+                    all_failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}")
             finally:
                 # 清理临时文件
                 try:
diff --git a/scripts/readers/html/cleaner.py b/scripts/readers/html/cleaner.py
index 2c52c9d..7ba0d87 100644
--- a/scripts/readers/html/cleaner.py
+++ b/scripts/readers/html/cleaner.py
@@ -1,69 +1,85 @@
 """HTML 清理模块,用于清理 HTML 内容中的敏感信息。"""
 
 import re
-from bs4 import BeautifulSoup
+from typing import Optional, Tuple
 
 
-def clean_html_content(html_content: str) -> str:
-    """清理 HTML 内容,移除 script/style/link/svg 标签和 URL 属性。"""
-    soup = BeautifulSoup(html_content, "html.parser")
+def clean_html_content(html_content: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    清理 HTML 内容,移除 script/style/link/svg 标签和 URL 属性。
 
-    # Remove all script tags
-    for script in soup.find_all("script"):
-        script.decompose()
+    Returns:
+        (content, error): 成功时返回 (清理后的 HTML, None),失败时返回 (None, 错误信息)
+    """
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        return None, "beautifulsoup4 库未安装"
 
-    # Remove all style tags
-    for style in soup.find_all("style"):
-        style.decompose()
+    try:
+        soup = BeautifulSoup(html_content, "html.parser")
 
-    # Remove all svg tags
-    for svg in soup.find_all("svg"):
-        svg.decompose()
+        # Remove all script tags
+        for script in soup.find_all("script"):
+            script.decompose()
 
-    # Remove all link tags
-    for link in soup.find_all("link"):
-        link.decompose()
+        # Remove all style tags
+        for style in soup.find_all("style"):
+            style.decompose()
 
-    # Remove URLs from href and src attributes
-    for tag in soup.find_all(True):
-        if "href" in tag.attrs:
-            del tag["href"]
-        if "src" in tag.attrs:
-            del tag["src"]
-        if "srcset" in tag.attrs:
-            del tag["srcset"]
-        if "action" in tag.attrs:
-            del tag["action"]
-        data_attrs = [
-            attr
-            for attr in tag.attrs
-            if attr.startswith("data-") and "src" in attr.lower()
-        ]
-        for attr in data_attrs:
-            del tag[attr]
+        # Remove all svg tags
+        for svg in soup.find_all("svg"):
+            svg.decompose()
 
-    # Remove all style attributes from all tags
-    for tag in soup.find_all(True):
-        if "style" in tag.attrs:
-            del tag["style"]
+        # Remove all link tags
+        for link in soup.find_all("link"):
+            link.decompose()
 
-    # Remove data-href attributes
-    for tag in soup.find_all(True):
-        if "data-href" in tag.attrs:
-            del tag["data-href"]
+        # Remove URLs from href and src attributes
+        for tag in soup.find_all(True):
+            if "href" in tag.attrs:
+                del tag["href"]
+            if "src" in tag.attrs:
+                del tag["src"]
+            if "srcset" in tag.attrs:
+                del tag["srcset"]
+            if "action" in tag.attrs:
+                del tag["action"]
+            data_attrs = [
+                attr
+                for attr in tag.attrs
+                if attr.startswith("data-") and "src" in attr.lower()
+            ]
+            for attr in data_attrs:
+                del tag[attr]
 
-    # Remove URLs from title attributes
-    for tag in soup.find_all(True):
-        if "title" in tag.attrs:
-            title = tag["title"]
-            cleaned_title = re.sub(r"https?://\S+", "", title, flags=re.IGNORECASE)
-            tag["title"] = cleaned_title
+        # Remove all style attributes from all tags
+        for tag in soup.find_all(True):
+            if "style" in tag.attrs:
+                del tag["style"]
 
-    # Remove class attributes that contain URL-like patterns
-    for tag in soup.find_all(True):
-        if "class" in tag.attrs:
-            classes = tag["class"]
-            cleaned_classes = [c for c in classes if not c.startswith("url ") and not "hyperlink-href:" in c]
-            tag["class"] = cleaned_classes
+        # Remove data-href attributes
+        for tag in soup.find_all(True):
+            if "data-href" in tag.attrs:
+                del tag["data-href"]
 
-    return str(soup)
+        # Remove URLs from title attributes
+        for tag in soup.find_all(True):
+            if "title" in tag.attrs:
+                title = tag["title"]
+                cleaned_title = re.sub(r"https?://\S+", "", title, flags=re.IGNORECASE)
+                tag["title"] = cleaned_title
+
+        # Remove class attributes that contain URL-like patterns
+        for tag in soup.find_all(True):
+            if "class" in tag.attrs:
+                classes = tag["class"]
+                cleaned_classes = [c for c in classes if not c.startswith("url ") and "hyperlink-href:" not in c]
+                tag["class"] = cleaned_classes
+
+        content = str(soup)
+        if not content.strip():
+            return None, "清理后内容为空"
+        return content, None
+    except Exception as e:
+        return None, f"BeautifulSoup 解析失败: {str(e)}"
diff --git a/scripts/readers/pdf/__init__.py b/scripts/readers/pdf/__init__.py
index 9f175b0..bf54a1b 100644
--- a/scripts/readers/pdf/__init__.py
+++ b/scripts/readers/pdf/__init__.py
@@ -44,10 +44,13 @@ class PdfReader(BaseReader):
         content = None
 
         for parser_name, parser_func in PARSERS:
-            content, error = parser_func(file_path)
-            if content is not None:
-                return content, failures
-            else:
-                failures.append(f"- {parser_name}: {error}")
+            try:
+                content, error = parser_func(file_path)
+                if content is not None:
+                    return content, failures
+                else:
+                    failures.append(f"- {parser_name}: {error}")
+            except Exception as e:
+                failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}")
 
         return None, failures
diff --git a/scripts/readers/pptx/__init__.py b/scripts/readers/pptx/__init__.py
index 6a92910..be12f43 100644
--- a/scripts/readers/pptx/__init__.py
+++ b/scripts/readers/pptx/__init__.py
@@ -42,10 +42,13 @@ class PptxReader(BaseReader):
         content = None
 
         for parser_name, parser_func in PARSERS:
-            content, error = parser_func(file_path)
-            if content is not None:
-                return content, failures
-            else:
-                failures.append(f"- {parser_name}: {error}")
+            try:
+                content, error = parser_func(file_path)
+                if content is not None:
+                    return content, failures
+                else:
+                    failures.append(f"- {parser_name}: {error}")
+            except Exception as e:
+                failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}")
 
         return None, failures
diff --git a/scripts/readers/xlsx/__init__.py b/scripts/readers/xlsx/__init__.py
index 8856e97..f86b122 100644
--- a/scripts/readers/xlsx/__init__.py
+++ b/scripts/readers/xlsx/__init__.py
@@ -42,10 +42,13 @@ class XlsxReader(BaseReader):
         content = None
 
         for parser_name, parser_func in PARSERS:
-            content, error = parser_func(file_path)
-            if content is not None:
-                return content, failures
-            else:
-                failures.append(f"- {parser_name}: {error}")
+            try:
+                content, error = parser_func(file_path)
+                if content is not None:
+                    return content, failures
+                else:
+                    failures.append(f"- {parser_name}: {error}")
+            except Exception as e:
+                failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}")
 
         return None, failures