Files
lyxy-document/scripts/readers/html/__init__.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

86 lines
2.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""HTML/URL 文件阅读器,支持多种解析方法。"""
import os
import tempfile
from typing import List, Optional, Tuple
from readers.base import BaseReader
from utils import is_url
from utils import encoding_detection
from . import cleaner
from .downloader import download_html
from . import trafilatura
from . import domscribe
from . import markitdown
from . import html2text
# Parser backends, listed in the order HtmlReader.parse tries them.
# Each entry is (name, parse_callable): the name is only used to label
# failure messages, and the callable is invoked with the path of a temporary
# .html file and is expected to return a (content, error) pair.
PARSERS = [
("trafilatura", trafilatura.parse),
("domscribe", domscribe.parse),
("MarkItDown", markitdown.parse),
("html2text", html2text.parse),
]
class HtmlReader(BaseReader):
    """Reader for local HTML files and remote URLs.

    The document is downloaded (URL) or read from disk (local path), passed
    through the HTML cleaner, and then offered to every backend in
    ``PARSERS`` in order until one of them produces content.
    """

    def supports(self, file_path: str) -> bool:
        """Tell whether this reader handles ``file_path``.

        A path is supported when ``is_url`` accepts it, or when it ends in
        ``.html`` / ``.htm`` (compared case-insensitively).
        """
        url_verdict = is_url(file_path)
        if url_verdict:
            # Hand back whatever truthy value is_url produced.
            return url_verdict
        return file_path.lower().endswith(('.html', '.htm'))

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Parse an HTML file or URL.

        Args:
            file_path: A URL or a path to a local HTML file.

        Returns:
            A ``(content, failures)`` pair. ``content`` is the result of the
            first backend in ``PARSERS`` that returns something other than
            ``None``; it is ``None`` when loading, cleaning, or every backend
            failed. ``failures`` holds the messages collected up to the point
            of return.
        """
        failures = []

        # Step 1: obtain the raw HTML text.
        if not is_url(file_path):
            # Local path: read the file from disk.
            if not os.path.exists(file_path):
                return None, ["文件不存在"]
            raw_html, read_error = encoding_detection.read_text_file(file_path)
            if read_error:
                return None, [f"- {read_error}"]
        else:
            # URL: download the page, keeping any download diagnostics.
            raw_html, download_failures = download_html(file_path)
            failures.extend(download_failures)
            if raw_html is None:
                return None, failures

        # Step 2: clean the HTML; a cleaner error aborts the whole parse.
        cleaned_html, clean_error = cleaner.clean_html_content(raw_html)
        if clean_error:
            failures.append(f"- cleaner: {clean_error}")
            return None, failures

        # Step 3: give each backend its own temporary file and try it.
        for label, run_parser in PARSERS:
            handle, scratch_path = tempfile.mkstemp(suffix='.html', text=True)
            try:
                # Persist the cleaned HTML as UTF-8 for the backend to read.
                with os.fdopen(handle, 'w', encoding='utf-8') as scratch:
                    scratch.write(cleaned_html)

                # Guard the backend call so one crashing parser does not stop
                # the remaining ones from being tried.
                try:
                    content, parse_error = run_parser(scratch_path)
                    if content is not None:
                        return content, failures
                    failures.append(f"- {label}: {parse_error}")
                except Exception as exc:
                    failures.append(
                        f"- {label}: [意外异常] {type(exc).__name__}: {exc!s}"
                    )
            finally:
                # Best-effort removal of the temporary file; also runs on the
                # early return above.
                try:
                    os.unlink(scratch_path)
                except Exception:
                    pass

        # Every backend failed.
        return None, failures