"""HTML/URL 文件阅读器,支持多种解析方法。""" import os from typing import List, Optional, Tuple from readers.base import BaseReader from utils import is_html_file, is_url from . import cleaner from . import downloader from . import trafilatura from . import domscribe from . import markitdown from . import html2text PARSERS = [ ("trafilatura", lambda c, t: trafilatura.parse(c)), ("domscribe", lambda c, t: domscribe.parse(c)), ("MarkItDown", lambda c, t: markitdown.parse(c, t)), ("html2text", lambda c, t: html2text.parse(c)), ] class HtmlReader(BaseReader): """HTML/URL 文件阅读器""" @property def supported_extensions(self) -> List[str]: return [".html", ".htm"] def supports(self, file_path: str) -> bool: return is_url(file_path) or is_html_file(file_path) def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]: """下载 URL 并解析""" all_failures = [] # 下载 HTML html_content, download_failures = downloader.download_html(url) all_failures.extend(download_failures) if html_content is None: return None, all_failures # 清理 HTML html_content = cleaner.clean_html_content(html_content) # 解析 HTML content, parse_failures = self._parse_html_content(html_content, None) all_failures.extend(parse_failures) return content, all_failures def _parse_html_content(self, html_content: str, temp_file_path: Optional[str]) -> Tuple[Optional[str], List[str]]: """解析 HTML 内容""" failures = [] content = None for parser_name, parser_func in PARSERS: content, error = parser_func(html_content, temp_file_path) if content is not None: return content, failures else: failures.append(f"- {parser_name}: {error}") return None, failures def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: all_failures = [] if is_url(file_path): return self.download_and_parse(file_path) # 读取 HTML 文件 try: with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() except Exception as e: return None, [f"- 读取文件失败: {str(e)}"] # 清理 HTML html_content = cleaner.clean_html_content(html_content) # 解析 HTML content, parse_failures = self._parse_html_content(html_content, file_path) all_failures.extend(parse_failures) return content, all_failures