"""HTML/URL 文件阅读器,支持多种解析方法。""" import os from typing import List, Optional, Tuple from readers.base import BaseReader from utils import is_url import encoding_detection from . import cleaner from . import downloader from . import trafilatura from . import domscribe from . import markitdown from . import html2text PARSERS = [ ("trafilatura", lambda c, t: trafilatura.parse(c)), ("domscribe", lambda c, t: domscribe.parse(c)), ("MarkItDown", lambda c, t: markitdown.parse(c, t)), ("html2text", lambda c, t: html2text.parse(c)), ] class HtmlReader(BaseReader): """HTML/URL 文件阅读器""" @property def supported_extensions(self) -> List[str]: return [".html", ".htm"] def supports(self, file_path: str) -> bool: return is_url(file_path) or file_path.endswith(('.html', '.htm')) def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]: """下载 URL 并解析""" all_failures = [] # 下载 HTML html_content, download_failures = downloader.download_html(url) all_failures.extend(download_failures) if html_content is None: return None, all_failures # 清理 HTML html_content = cleaner.clean_html_content(html_content) # 解析 HTML content, parse_failures = self._parse_html_content(html_content, None) all_failures.extend(parse_failures) return content, all_failures def _parse_html_content(self, html_content: str, temp_file_path: Optional[str]) -> Tuple[Optional[str], List[str]]: """解析 HTML 内容""" failures = [] content = None for parser_name, parser_func in PARSERS: content, error = parser_func(html_content, temp_file_path) if content is not None: return content, failures else: failures.append(f"- {parser_name}: {error}") return None, failures def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: all_failures = [] # 判断输入类型 if is_url(file_path): return self.download_and_parse(file_path) # 读取本地 HTML 文件,使用编码检测 html_content, error = encoding_detection.read_text_file(file_path) if error: return None, [f"- {error}"] # 清理 HTML html_content = cleaner.clean_html_content(html_content) # 解析 HTML content, parse_failures = self._parse_html_content(html_content, file_path) all_failures.extend(parse_failures) return content, all_failures