"""测试 html2text Reader 的解析功能。""" import pytest import os from scripts.readers.html import HtmlReader class TestHtml2TextReaderParse: """测试 html2text Reader 的 parse 方法。""" def test_normal_file(self, temp_html): """测试正常 HTML 文件解析。""" html_content = """

主标题

这是一段测试内容。

子标题

列表项1
列表项2

单元格1

单元格2

""" file_path = temp_html(content=html_content) reader = HtmlReader() content, failures = reader.parse(file_path) # 验证解析成功 assert content is not None, f"解析失败: {failures}" # 验证关键内容存在 assert "主标题" in content assert "测试内容" in content assert "子标题" in content assert "列表项1" in content def test_file_not_exists(self, tmp_path): """测试文件不存在的情况。""" non_existent_file = str(tmp_path / "non_existent.html") reader = HtmlReader() content, failures = reader.parse(non_existent_file) # 验证返回 None 和错误信息 assert content is None assert len(failures) > 0 assert any("不存在" in f or "找不到" in f for f in failures) def test_empty_file(self, temp_html): """测试空 HTML 文件。""" file_path = temp_html(content="") reader = HtmlReader() content, failures = reader.parse(file_path) # 空文件应该返回 None 或空字符串 assert content is None or content.strip() == "" def test_corrupted_file(self, temp_html): """测试损坏的 HTML 文件。""" # HTML 解析器通常比较宽容，但我们可以测试完全无效的内容 file_path = temp_html(content="<<>>invalid<>") reader = HtmlReader() content, failures = reader.parse(file_path) # HTML 解析器可能仍然能解析，或返回错误 # 这里只验证不会崩溃 def test_special_chars(self, temp_html): """测试特殊字符处理。""" html_content = """

中文测试内容

Emoji测试: 😀🎉🚀

特殊符号: ©®™°±

混合内容: Hello你好🎉World世界

""" file_path = temp_html(content=html_content) reader = HtmlReader() content, failures = reader.parse(file_path) assert content is not None, f"解析失败: {failures}" # 验证各种特殊字符都被正确处理 assert "中文测试内容" in content assert "Hello你好" in content or "World世界" in content def test_encoding_gbk(self, temp_html): """测试 GBK 编码的 HTML 文件。""" html_content = "

中文内容

" file_path = temp_html(content=html_content, encoding='gbk') reader = HtmlReader() content, failures = reader.parse(file_path) # 验证能够正确处理 GBK 编码 # 注意：某些 Reader 可能无法自动检测编码 if content: assert len(content.strip()) > 0 def test_encoding_utf8_bom(self, temp_html, tmp_path): """测试 UTF-8 BOM 的 HTML 文件。""" html_content = "

测试内容

" file_path = tmp_path / "test_bom.html" # 写入带 BOM 的 UTF-8 文件 with open(file_path, 'wb') as f: f.write(b'\xef\xbb\xbf') # UTF-8 BOM f.write(html_content.encode('utf-8')) reader = HtmlReader() content, failures = reader.parse(str(file_path)) # 验证能够正确处理 UTF-8 BOM if content: assert "测试内容" in content class TestHtml2TextReaderSupports: """测试 html2text Reader 的 supports 方法。""" def test_supports_html_extension(self): """测试识别 .html 扩展名。""" reader = HtmlReader() assert reader.supports("test.html") is True def test_supports_htm_extension(self): """测试识别 .htm 扩展名。""" reader = HtmlReader() assert reader.supports("test.htm") is True def test_supports_uppercase_extension(self): """测试识别大写扩展名。""" reader = HtmlReader() assert reader.supports("TEST.HTML") is True def test_supports_url(self): """测试 URL。""" reader = HtmlReader() # HTML Reader 通常支持 URL result = reader.supports("http://example.com/page.html") # 根据实际实现可能返回 True def test_rejects_unsupported_format(self): """测试拒绝不支持的格式。""" reader = HtmlReader() assert reader.supports("test.pdf") is False assert reader.supports("test.docx") is False