# lyxy-document/scripts/readers/html/__init__.py

"""HTML/URL 文件阅读器，支持多种解析方法。"""

import os
import tempfile
from typing import List, Optional, Tuple

from scripts.readers.base import BaseReader
from scripts.utils import is_url
from scripts.utils import encoding_detection

from . import cleaner
from . import downloader
from . import trafilatura
from . import domscribe
from . import markitdown
from . import html2text


# Fallback chain used by HtmlReader.parse, as (display name, parse function) pairs.
# Parsers are tried in list order; the first one that returns non-None content
# wins. The display name only appears in the failure message recorded when a
# parser returns no content.
PARSERS = [
    ("trafilatura", trafilatura.parse),
    ("domscribe", domscribe.parse),
    ("MarkItDown", markitdown.parse),
    ("html2text", html2text.parse),
]


class HtmlReader(BaseReader):
    """Reader for local HTML files and URLs.

    The HTML is obtained (downloaded or read from disk), passed through
    ``cleaner.clean_html_content``, and then handed to each entry of
    ``PARSERS`` in order until one of them returns content.
    """

    def supports(self, file_path: str) -> bool:
        """Return True for URLs and for paths ending in ``.html`` / ``.htm``.

        The extension check is case-insensitive.
        """
        return is_url(file_path) or file_path.lower().endswith(('.html', '.htm'))

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse an HTML file or URL.

        Args:
            file_path: A URL (as judged by ``is_url``) or a local file path.

        Returns:
            A ``(content, failures)`` tuple. ``content`` is the output of the
            first parser that returned something other than ``None``, or
            ``None`` when loading failed or every parser failed. ``failures``
            collects the failure messages gathered along the way (download
            failures plus one ``"- <parser>: <error>"`` entry per parser that
            returned no content).
        """
        all_failures = []

        # Step 1: obtain the raw HTML.
        if is_url(file_path):
            # URL: download the page; download failures are kept even when
            # the download eventually succeeds.
            html_content, download_failures = downloader.download_html(file_path)
            all_failures.extend(download_failures)
            if html_content is None:
                return None, all_failures
        else:
            # Local path: read the file from disk.
            if not os.path.exists(file_path):
                return None, ["文件不存在"]
            html_content, error = encoding_detection.read_text_file(file_path)
            if error:
                return None, [f"- {error}"]

        # Step 2: clean the HTML before handing it to the parsers.
        html_content = cleaner.clean_html_content(html_content)

        # Step 3: try each parser in order; each one gets its own temp file.
        for parser_name, parser_func in PARSERS:
            content, error = self._run_parser(parser_func, html_content)
            if content is not None:
                return content, all_failures
            all_failures.append(f"- {parser_name}: {error}")

        # Every parser failed.
        return None, all_failures

    @staticmethod
    def _run_parser(parser_func, html_content: str):
        """Write ``html_content`` to a fresh temp file and run one parser on it.

        Args:
            parser_func: Callable that takes the temp file path; its return
                value is passed back unchanged.
            html_content: Cleaned HTML text, written as UTF-8.

        Returns:
            Whatever ``parser_func`` returns (unpacked by the caller as a
            ``(content, error)`` pair).

        NOTE(review): an exception raised by ``parser_func`` propagates to the
        caller and skips the remaining fallback parsers — confirm that every
        parser reports failure via its return value instead of raising.
        """
        fd, temp_file_path = tempfile.mkstemp(suffix='.html', text=True)
        try:
            # The file is closed before parsing so the parser sees the full,
            # flushed content.
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(html_content)
            return parser_func(temp_file_path)
        finally:
            # Best-effort cleanup: a leftover temp file must not mask the
            # parse result, but only filesystem errors are ignored.
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass