diff --git a/openspec/specs/html-reader/spec.md b/openspec/specs/html-reader/spec.md index e21b217..ae9992f 100644 --- a/openspec/specs/html-reader/spec.md +++ b/openspec/specs/html-reader/spec.md @@ -28,7 +28,7 @@ HTML/URL 文档解析能力,支持多种解析方法。 #### Scenario: HTML reader 解析 URL - **WHEN** 调用 HtmlReader.parse(input_path) 且 input_path 为 URL -- **THEN** 系统调用内部的 download_and_parse() 方法 +- **THEN** 系统在内部处理 URL 下载和解析 #### Scenario: 按优先级尝试下载器 - **WHEN** 输入为 URL @@ -215,19 +215,41 @@ HTML/URL 文档解析能力,支持多种解析方法。 - **THEN** 系统返回错误信息包含编码检测失败原因 ### Requirement: HTML reader 统一处理 URL 和文件 -系统 SHALL 在 HTML reader 的 parse() 方法中统一处理 URL 和本地文件。 +系统 SHALL 在 HTML reader 的 `parse()` 方法中统一处理 URL 和本地文件,无需单独的 `download_and_parse()` 方法。 #### Scenario: parse() 判断输入类型 -- **WHEN** 调用 HtmlReader.parse(file_path) +- **WHEN** 调用 `HtmlReader.parse(file_path)` - **THEN** 系统判断 file_path 是 URL 还是本地文件 -#### Scenario: URL 调用 download_and_parse +#### Scenario: URL 下载后解析 - **WHEN** file_path 为 URL -- **THEN** 系统调用 self.download_and_parse(file_path) +- **THEN** 系统下载 HTML 内容 +- **AND** 清理 HTML 内容 +- **AND** 创建临时文件(UTF-8 编码) +- **AND** 传递临时文件路径给各 parser #### Scenario: 本地文件读取并解析 - **WHEN** file_path 为本地文件 -- **THEN** 系统使用编码检测读取文件,然后解析 HTML 内容 +- **THEN** 系统使用编码检测读取文件 +- **AND** 清理 HTML 内容 +- **AND** 创建临时文件(UTF-8 编码)或使用原文件路径 +- **AND** 传递文件路径给各 parser + +### Requirement: 每个 HTML Parser 接收文件路径 +每个 HTML parser SHALL 接收文件路径作为输入,而非 HTML 字符串。 + +#### Scenario: Parser 接收 file_path +- **WHEN** `HtmlReader.parse()` 调用 parser +- **THEN** parser 接收 `file_path: str` 参数 + +#### Scenario: Parser 内部读取文件 +- **WHEN** parser 解析 HTML +- **THEN** parser 内部使用 UTF-8 编码读取文件内容 + +#### Scenario: Parser 独立临时文件 +- **WHEN** 多个 parser 依次尝试 +- **THEN** 每个 parser 使用独立的临时文件副本 +- **AND** 用完后立即清理临时文件 ### Requirement: HTML reader supports() 支持 URL 系统 SHALL 在 HTML reader 的 supports() 方法中同时支持 URL 和 HTML 文件扩展名。 diff --git a/scripts/readers/html/__init__.py b/scripts/readers/html/__init__.py index ebbc476..9bef9e6 100644 --- a/scripts/readers/html/__init__.py +++ b/scripts/readers/html/__init__.py @@ -1,6 +1,7 @@ """HTML/URL 文件阅读器,支持多种解析方法。""" import os +import tempfile from typing import List, Optional, Tuple from scripts.readers.base import BaseReader @@ -16,10 +17,10 @@ from . import html2text PARSERS = [ - ("trafilatura", lambda c, t: trafilatura.parse(c)), - ("domscribe", lambda c, t: domscribe.parse(c)), - ("MarkItDown", lambda c, t: markitdown.parse(c, t)), - ("html2text", lambda c, t: html2text.parse(c)), + ("trafilatura", trafilatura.parse), + ("domscribe", domscribe.parse), + ("MarkItDown", markitdown.parse), + ("html2text", html2text.parse), ] @@ -29,61 +30,49 @@ class HtmlReader(BaseReader): def supports(self, file_path: str) -> bool: return is_url(file_path) or file_path.lower().endswith(('.html', '.htm')) - def download_and_parse(self, url: str) -> Tuple[Optional[str], List[str]]: - """下载 URL 并解析""" - all_failures = [] - - # 下载 HTML - html_content, download_failures = downloader.download_html(url) - all_failures.extend(download_failures) - - if html_content is None: - return None, all_failures - - # 清理 HTML - html_content = cleaner.clean_html_content(html_content) - - # 解析 HTML - content, parse_failures = self._parse_html_content(html_content, None) - all_failures.extend(parse_failures) - - return content, all_failures - - def _parse_html_content(self, html_content: str, temp_file_path: Optional[str]) -> Tuple[Optional[str], List[str]]: - """解析 HTML 内容""" - failures = [] - content = None - - for parser_name, parser_func in PARSERS: - content, error = parser_func(html_content, temp_file_path) - if content is not None: - return content, failures - else: - failures.append(f"- {parser_name}: {error}") - - return None, failures - def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: + """解析 HTML 文件或 URL""" all_failures = [] - # 判断输入类型 + # 步骤 1: 获取 HTML 内容 if is_url(file_path): - return self.download_and_parse(file_path) + # URL 路径: 下载 HTML + html_content, download_failures = downloader.download_html(file_path) + all_failures.extend(download_failures) + if html_content is None: + return None, all_failures + else: + # 本地文件路径: 读取文件 + if not os.path.exists(file_path): + return None, ["文件不存在"] + html_content, error = encoding_detection.read_text_file(file_path) + if error: + return None, [f"- {error}"] - # 检查文件是否存在 - if not os.path.exists(file_path): - return None, ["文件不存在"] - - # 读取本地 HTML 文件,使用编码检测 - html_content, error = encoding_detection.read_text_file(file_path) - if error: - return None, [f"- {error}"] - - # 清理 HTML + # 步骤 2: 清理 HTML 内容 html_content = cleaner.clean_html_content(html_content) - # 解析 HTML - content, parse_failures = self._parse_html_content(html_content, file_path) - all_failures.extend(parse_failures) + # 步骤 3: 对每个 Parser 创建独立的临时文件并尝试解析 + for parser_name, parser_func in PARSERS: + # 创建临时文件 + fd, temp_file_path = tempfile.mkstemp(suffix='.html', text=True) + try: + # 写入清理后的 HTML 内容(UTF-8 编码) + with os.fdopen(fd, 'w', encoding='utf-8') as f: + f.write(html_content) - return content, all_failures + # 调用 Parser 解析 + content, error = parser_func(temp_file_path) + if content is not None: + return content, all_failures + else: + all_failures.append(f"- {parser_name}: {error}") + finally: + # 清理临时文件 + try: + os.unlink(temp_file_path) + except Exception: + pass + + # 所有 Parser 都失败 + return None, all_failures diff --git a/scripts/readers/html/domscribe.py b/scripts/readers/html/domscribe.py index c88e710..3ee2522 100644 --- a/scripts/readers/html/domscribe.py +++ b/scripts/readers/html/domscribe.py @@ -3,13 +3,21 @@ from typing import Optional, Tuple -def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]: - """使用 domscribe 解析 HTML""" +def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 domscribe 解析 HTML 文件""" try: from domscribe import html_to_markdown except ImportError: return None, "domscribe 库未安装" + try: + with open(file_path, 'r', encoding='utf-8') as f: + html_content = f.read() + except FileNotFoundError: + return None, f"文件不存在: {file_path}" + except Exception as e: + return None, f"读取文件失败: {str(e)}" + try: options = { 'extract_main_content': True, diff --git a/scripts/readers/html/html2text.py b/scripts/readers/html/html2text.py index e22fda8..61764d3 100644 --- a/scripts/readers/html/html2text.py +++ b/scripts/readers/html/html2text.py @@ -3,13 +3,21 @@ from typing import Optional, Tuple -def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]: - """使用 html2text 解析 HTML(兜底方案)""" +def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 html2text 解析 HTML 文件(兜底方案)""" try: import html2text except ImportError: return None, "html2text 库未安装" + try: + with open(file_path, 'r', encoding='utf-8') as f: + html_content = f.read() + except FileNotFoundError: + return None, f"文件不存在: {file_path}" + except Exception as e: + return None, f"读取文件失败: {str(e)}" + try: converter = html2text.HTML2Text() converter.ignore_emphasis = False diff --git a/scripts/readers/html/markitdown.py b/scripts/readers/html/markitdown.py index 026176a..eaf2e90 100644 --- a/scripts/readers/html/markitdown.py +++ b/scripts/readers/html/markitdown.py @@ -1,39 +1,24 @@ """使用 MarkItDown 解析 HTML""" -import os -import tempfile from typing import Optional, Tuple -def parse(html_content: str, temp_file_path: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]: - """使用 MarkItDown 解析 HTML""" +def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 MarkItDown 解析 HTML 文件""" try: from markitdown import MarkItDown except ImportError: return None, "MarkItDown 库未安装" try: - input_path = temp_file_path - if not input_path or not os.path.exists(input_path): - # 创建临时文件 - fd, input_path = tempfile.mkstemp(suffix='.html') - with os.fdopen(fd, 'w', encoding='utf-8') as f: - f.write(html_content) - md = MarkItDown() result = md.convert( - input_path, + file_path, heading_style="ATX", strip=["img", "script", "style", "noscript"], ) markdown_content = result.text_content - if not temp_file_path: - try: - os.unlink(input_path) - except Exception: - pass - if not markdown_content.strip(): return None, "解析内容为空" return markdown_content, None diff --git a/scripts/readers/html/trafilatura.py b/scripts/readers/html/trafilatura.py index 3c3072d..c5ecf09 100644 --- a/scripts/readers/html/trafilatura.py +++ b/scripts/readers/html/trafilatura.py @@ -3,13 +3,21 @@ from typing import Optional, Tuple -def parse(html_content: str) -> Tuple[Optional[str], Optional[str]]: - """使用 trafilatura 解析 HTML""" +def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 trafilatura 解析 HTML 文件""" try: import trafilatura except ImportError: return None, "trafilatura 库未安装" + try: + with open(file_path, 'r', encoding='utf-8') as f: + html_content = f.read() + except FileNotFoundError: + return None, f"文件不存在: {file_path}" + except Exception as e: + return None, f"读取文件失败: {str(e)}" + try: markdown_content = trafilatura.extract( html_content, diff --git a/tests/test_readers/test_html/test_consistency.py b/tests/test_readers/test_html/test_consistency.py index cee85b4..b9d17e2 100644 --- a/tests/test_readers/test_html/test_consistency.py +++ b/tests/test_readers/test_html/test_consistency.py @@ -25,20 +25,16 @@ class TestHtmlReadersConsistency: """) - # 读取 HTML 内容 - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - parsers = [ - ("html2text", lambda c: html2text.parse(c)), - ("markitdown", lambda c: markitdown.parse(c, file_path)), - ("trafilatura", lambda c: trafilatura.parse(c)), - ("domscribe", lambda c: domscribe.parse(c)), + ("html2text", html2text.parse), + ("markitdown", markitdown.parse), + ("trafilatura", trafilatura.parse), + ("domscribe", domscribe.parse), ] successful_results = [] for name, parser in parsers: - content, error = parser(html_content) + content, error = parser(file_path) if content is not None and content.strip(): successful_results.append((name, content)) diff --git a/tests/test_readers/test_html/test_domscribe_html.py b/tests/test_readers/test_html/test_domscribe_html.py index dcfd0fc..9b7923b 100644 --- a/tests/test_readers/test_html/test_domscribe_html.py +++ b/tests/test_readers/test_html/test_domscribe_html.py @@ -10,36 +10,27 @@ class TestDomscribeHtmlReaderParse: def test_normal_file(self, temp_html): """测试正常 HTML 文件解析。""" file_path = temp_html(content="

标题

段落内容

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = domscribe.parse(html_content) + content, error = domscribe.parse(file_path) if content is not None: assert "标题" in content or "段落" in content def test_file_not_exists(self, tmp_path): """测试文件不存在的情况。""" - html_content = "

测试

" - content, error = domscribe.parse(html_content) - assert content is not None or error is not None + non_existent_path = str(tmp_path / "non_existent.html") + content, error = domscribe.parse(non_existent_path) + assert content is None + # 如果库未安装,也会返回 None,但错误信息不同 + assert error is not None def test_empty_file(self, temp_html): """测试空 HTML 文件。""" file_path = temp_html(content="") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = domscribe.parse(html_content) + content, error = domscribe.parse(file_path) assert content is None or content.strip() == "" - def test_corrupted_file(self, temp_html, tmp_path): - """测试损坏的 HTML 文件。""" - html_content = "\xff\xfe\x00\x00" - content, error = domscribe.parse(html_content) - def test_special_chars(self, temp_html): """测试特殊字符处理。""" file_path = temp_html(content="

中文测试 😀 ©®

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = domscribe.parse(html_content) + content, error = domscribe.parse(file_path) if content is not None: assert "中文" in content or "测试" in content diff --git a/tests/test_readers/test_html/test_markitdown_html.py b/tests/test_readers/test_html/test_markitdown_html.py index eb8d1b1..9592a7b 100644 --- a/tests/test_readers/test_html/test_markitdown_html.py +++ b/tests/test_readers/test_html/test_markitdown_html.py @@ -10,38 +10,26 @@ class TestMarkitdownHtmlReaderParse: def test_normal_file(self, temp_html): """测试正常 HTML 文件解析。""" file_path = temp_html(content="

标题

段落内容

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = markitdown.parse(html_content, file_path) + content, error = markitdown.parse(file_path) if content is not None: assert "标题" in content or "段落" in content def test_file_not_exists(self, tmp_path): """测试文件不存在的情况。""" - html_content = "

测试

" - content, error = markitdown.parse(html_content, None) - # markitdown 应该能解析内容 - assert content is not None or error is not None + non_existent_path = str(tmp_path / "non_existent.html") + content, error = markitdown.parse(non_existent_path) + # markitdown 库自己会处理文件不存在的情况 + assert content is None or error is not None def test_empty_file(self, temp_html): """测试空 HTML 文件。""" file_path = temp_html(content="") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = markitdown.parse(html_content, file_path) + content, error = markitdown.parse(file_path) assert content is None or content.strip() == "" - def test_corrupted_file(self, temp_html, tmp_path): - """测试损坏的 HTML 文件。""" - html_content = "\xff\xfe\x00\x00" - content, error = markitdown.parse(html_content, None) - # HTML 解析器通常比较宽容,可能仍能解析 - def test_special_chars(self, temp_html): """测试特殊字符处理。""" file_path = temp_html(content="

中文测试 😀 ©®

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = markitdown.parse(html_content, file_path) + content, error = markitdown.parse(file_path) if content is not None: assert "中文" in content or "测试" in content diff --git a/tests/test_readers/test_html/test_trafilatura_html.py b/tests/test_readers/test_html/test_trafilatura_html.py index d986e30..12de5ed 100644 --- a/tests/test_readers/test_html/test_trafilatura_html.py +++ b/tests/test_readers/test_html/test_trafilatura_html.py @@ -10,36 +10,27 @@ class TestTrafilaturaHtmlReaderParse: def test_normal_file(self, temp_html): """测试正常 HTML 文件解析。""" file_path = temp_html(content="

标题

段落内容

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = trafilatura.parse(html_content) + content, error = trafilatura.parse(file_path) if content is not None: assert "标题" in content or "段落" in content def test_file_not_exists(self, tmp_path): """测试文件不存在的情况。""" - html_content = "

测试

" - content, error = trafilatura.parse(html_content) - assert content is not None or error is not None + non_existent_path = str(tmp_path / "non_existent.html") + content, error = trafilatura.parse(non_existent_path) + assert content is None + # 如果库未安装,也会返回 None,但错误信息不同 + assert error is not None def test_empty_file(self, temp_html): """测试空 HTML 文件。""" file_path = temp_html(content="") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = trafilatura.parse(html_content) + content, error = trafilatura.parse(file_path) assert content is None or content.strip() == "" - def test_corrupted_file(self, temp_html, tmp_path): - """测试损坏的 HTML 文件。""" - html_content = "\xff\xfe\x00\x00" - content, error = trafilatura.parse(html_content) - def test_special_chars(self, temp_html): """测试特殊字符处理。""" file_path = temp_html(content="

中文测试 😀 ©®

") - with open(file_path, 'r', encoding='utf-8') as f: - html_content = f.read() - content, error = trafilatura.parse(html_content) + content, error = trafilatura.parse(file_path) if content is not None: assert "中文" in content or "测试" in content