"""Tests for the parsing behaviour of the Trafilatura HTML reader."""

import pytest

from scripts.readers.html import trafilatura


def _read_text(path):
    """Return the UTF-8 decoded text content of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


class TestTrafilaturaHtmlReaderParse:
    """Tests for ``trafilatura.parse``.

    Every test feeds a string to ``trafilatura.parse`` and unpacks the
    result as a ``(content, error)`` pair.
    """

    def test_normal_file(self, temp_html):
        """Parse a small document that is written to disk and read back.

        The assertion is conditional: when ``parse`` yields no content, the
        test only verifies that parsing completed without raising.
        """
        file_path = temp_html(content="\n\n标题\n\n段落内容\n\n")
        html_content = _read_text(file_path)

        content, _error = trafilatura.parse(html_content)

        if content is not None:
            assert "标题" in content or "段落" in content

    def test_file_not_exists(self, tmp_path):
        """Check that ``parse`` reports either content or an error.

        NOTE(review): despite its name, this test never touches a missing
        file (``tmp_path`` is requested but unused); it passes an in-memory
        string straight to ``parse``. The name and signature are kept so
        existing test selections keep working.
        """
        html_content = "\n\n测试\n\n"

        content, error = trafilatura.parse(html_content)

        assert content is not None or error is not None

    def test_empty_file(self, temp_html):
        """Parse an empty file: content must be ``None`` or blank."""
        file_path = temp_html(content="")
        html_content = _read_text(file_path)

        content, _error = trafilatura.parse(html_content)

        assert content is None or content.strip() == ""

    def test_corrupted_file(self, temp_html, tmp_path):
        """Parse a string of non-text characters without raising.

        The input is passed directly to ``parse``; both fixtures are
        requested but unused and are kept only to preserve the signature.
        """
        html_content = "\xff\xfe\x00\x00"

        content, _error = trafilatura.parse(html_content)

        # Besides not raising, parse must still honour the content type the
        # other tests rely on (they call ``in`` / ``.strip()`` on it).
        assert content is None or isinstance(content, str)

    def test_special_chars(self, temp_html):
        """Parse a document containing CJK text, an emoji and symbols.

        The assertion is conditional: when ``parse`` yields no content, the
        test only verifies that parsing completed without raising.
        """
        file_path = temp_html(content="\n\n中文测试 😀 ©®\n\n")
        html_content = _read_text(file_path)

        content, _error = trafilatura.parse(html_content)

        if content is not None:
            assert "中文" in content or "测试" in content