"""HTML/URL document reader that tries several parsing backends in turn."""

import os
import tempfile
from typing import Callable, List, Optional, Tuple

from scripts.readers.base import BaseReader
from scripts.utils import is_url
from scripts.utils import encoding_detection

from . import cleaner
from .downloader import download_html
from . import trafilatura
from . import domscribe
from . import markitdown
from . import html2text

# Parsers are tried in this order; the first one that returns content wins.
PARSERS = [
    ("trafilatura", trafilatura.parse),
    ("domscribe", domscribe.parse),
    ("MarkItDown", markitdown.parse),
    ("html2text", html2text.parse),
]


def _run_parser(
    parser_name: str,
    parser_func: Callable[[str], Tuple[Optional[str], Optional[str]]],
    html_content: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Run one parser against its own temporary copy of the HTML.

    The HTML is written to a fresh ``.html`` temp file (UTF-8), the parser is
    called with that file's path, and the file is removed afterwards.

    Args:
        parser_name: Label used as the prefix of the failure message.
        parser_func: Callable taking a file path and returning
            ``(content, error)``.
        html_content: HTML text to hand to the parser.

    Returns:
        ``(content, None)`` when the parser produced content, otherwise
        ``(None, failure_message)``. Never raises for ``Exception``
        subclasses: temp-file creation/write errors and parser exceptions are
        all converted into a failure message.
    """
    temp_file_path = None
    try:
        # Temp-file creation and writing sit inside the guard as well, so an
        # OSError or UnicodeEncodeError is reported as a failure entry instead
        # of escaping to the caller.
        fd, temp_file_path = tempfile.mkstemp(suffix='.html', text=True)
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(html_content)
        content, error = parser_func(temp_file_path)
    except Exception as e:
        return None, f"- {parser_name}: [意外异常] {type(e).__name__}: {e}"
    finally:
        if temp_file_path is not None:
            # Best-effort cleanup: a leftover temp file must not turn a
            # successful parse into a failure.
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass

    if content is not None:
        return content, None
    return None, f"- {parser_name}: {error}"


class HtmlReader(BaseReader):
    """Reader for local HTML files and for URLs."""

    def supports(self, file_path: str) -> bool:
        """Return True for URLs and for paths ending in ``.html`` / ``.htm``."""
        return is_url(file_path) or file_path.lower().endswith(('.html', '.htm'))

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse an HTML file or URL.

        Args:
            file_path: A URL (as judged by ``is_url``) or a local file path.

        Returns:
            ``(content, failures)``. ``content`` is the output of the first
            parser in ``PARSERS`` that returned a non-None result, or ``None``
            when fetching, cleaning, or every parser failed. ``failures``
            lists the failure messages collected along the way; it may be
            non-empty even on success.
        """
        all_failures: List[str] = []

        # Step 1: obtain the raw HTML.
        if is_url(file_path):
            # URL: download the page.
            html_content, download_failures = download_html(file_path)
            all_failures.extend(download_failures)
            if html_content is None:
                return None, all_failures
        else:
            # Local path: read the file from disk.
            if not os.path.exists(file_path):
                return None, ["文件不存在"]
            html_content, error = encoding_detection.read_text_file(file_path)
            if error:
                return None, [f"- {error}"]

        # Step 2: clean the HTML before handing it to any parser.
        cleaned_html, error = cleaner.clean_html_content(html_content)
        if error:
            all_failures.append(f"- cleaner: {error}")
            return None, all_failures
        html_content = cleaned_html

        # Step 3: try each parser in order, each with its own temp file.
        for parser_name, parser_func in PARSERS:
            content, failure = _run_parser(parser_name, parser_func, html_content)
            if content is not None:
                return content, all_failures
            all_failures.append(failure)

        # Every parser failed.
        return None, all_failures