"""Tests for the parsing behaviour of the html2text-based HTML reader."""

import os

import pytest

from scripts.readers.html import HtmlReader


class TestHtml2TextReaderParse:
    """Tests for the ``parse`` method of the html2text reader."""

    def test_normal_file(self, temp_html):
        """Parse a regular HTML file containing mixed Unicode content."""
        # NOTE(review): this fixture text contains no HTML tags; the markup
        # looks like it was stripped at some point -- confirm the intended
        # document before relying on table/paragraph handling here.
        html_content = """
这是一段测试内容。
| 单元格1 | 单元格2 |
中文测试内容
Emoji测试: 😀🎉🚀
特殊符号: ©®™°±
混合内容: Hello你好🎉World世界
"""
        file_path = temp_html(content=html_content)
        reader = HtmlReader()

        content, failures = reader.parse(file_path)

        assert content is not None, f"解析失败: {failures}"
        # Check that the non-ASCII text survives the conversion.
        assert "中文内容" in content or "中文测试内容" in content
        assert "中文测试内容" in content
        assert "Hello你好" in content or "World世界" in content

    def test_encoding_gbk(self, temp_html):
        """Parse an HTML file that is stored with GBK encoding."""
        html_content = "中文内容\n"
        file_path = temp_html(content=html_content, encoding='gbk')
        reader = HtmlReader()

        content, failures = reader.parse(file_path)

        # Some readers cannot auto-detect the encoding, so an empty result is
        # tolerated -- but report it as a skip instead of a silent pass so the
        # test run shows that nothing was actually verified.
        if not content:
            pytest.skip(f"HtmlReader returned no content for GBK input: {failures}")
        assert len(content.strip()) > 0

    def test_encoding_utf8_bom(self, temp_html, tmp_path):
        """Parse an HTML file that starts with a UTF-8 byte-order mark."""
        html_content = "测试内容\n"
        file_path = tmp_path / "test_bom.html"
        # Write the UTF-8 BOM followed by the UTF-8 encoded document.
        file_path.write_bytes(b'\xef\xbb\xbf' + html_content.encode('utf-8'))
        reader = HtmlReader()

        content, failures = reader.parse(str(file_path))

        # An empty result is tolerated, but surfaced as a skip rather than a
        # silent pass so the missing verification is visible.
        if not content:
            pytest.skip(f"HtmlReader returned no content for BOM input: {failures}")
        assert "测试内容" in content


class TestHtml2TextReaderSupports:
    """Tests for the ``supports`` method of the html2text reader."""

    def test_supports_html_extension(self):
        """The ``.html`` extension is recognised."""
        reader = HtmlReader()
        assert reader.supports("test.html") is True

    def test_supports_htm_extension(self):
        """The ``.htm`` extension is recognised."""
        reader = HtmlReader()
        assert reader.supports("test.htm") is True

    def test_supports_uppercase_extension(self):
        """Upper-case extensions are recognised."""
        reader = HtmlReader()
        assert reader.supports("TEST.HTML") is True

    def test_supports_url(self):
        """Calling ``supports`` with a URL yields a boolean answer."""
        reader = HtmlReader()
        result = reader.supports("http://example.com/page.html")
        # Whether URLs are accepted depends on the implementation, so only the
        # shape of the answer is pinned: the call must succeed and return a
        # bool, consistent with the ``is True`` / ``is False`` checks used by
        # the other tests in this class.
        assert isinstance(result, bool)

    def test_rejects_unsupported_format(self):
        """Non-HTML extensions are rejected."""
        reader = HtmlReader()
        assert reader.supports("test.pdf") is False
        assert reader.supports("test.docx") is False