Files
lyxy-document/tests/test_readers/test_html/test_trafilatura_html.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

37 lines
1.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""测试 Trafilatura HTML Reader 的解析功能。"""
import pytest
from readers.html import trafilatura
class TestTrafilaturaHtmlReaderParse:
    """Tests for the ``parse`` function of the Trafilatura HTML reader.

    ``trafilatura.parse(path)`` is unpacked as a ``(content, error)`` pair in
    every test below. The ``temp_html`` fixture is defined outside this file;
    here it is called with a ``content=`` keyword and its return value is
    passed straight to ``parse`` as the file path.
    """

    def test_normal_file(self, temp_html):
        """Parse a small well-formed HTML file and check text is extracted."""
        file_path = temp_html(content="<h1>标题</h1><p>段落内容</p>")
        content, error = trafilatura.parse(file_path)
        if content is None:
            # No content means there is nothing to verify (e.g. the backing
            # library may be missing). Skip explicitly so the run reports it
            # instead of silently passing without any assertion.
            pytest.skip(f"trafilatura reader returned no content: {error}")
        assert "标题" in content or "段落" in content

    def test_file_not_exists(self, tmp_path):
        """A missing file must yield no content and a non-empty error."""
        non_existent_path = str(tmp_path / "non_existent.html")
        content, error = trafilatura.parse(non_existent_path)
        assert content is None
        # If the library is not installed the reader also returns None,
        # just with a different error message, so only presence is checked.
        assert error is not None

    def test_empty_file(self, temp_html):
        """An HTML document with an empty body yields no usable text."""
        file_path = temp_html(content="<html><body></body></html>")
        content, _error = trafilatura.parse(file_path)
        # Either "no content" or "only whitespace" is accepted.
        assert content is None or content.strip() == ""

    def test_special_chars(self, temp_html):
        """CJK text, emoji and symbol characters survive parsing."""
        file_path = temp_html(content="<p>中文测试 😀 ©®</p>")
        content, error = trafilatura.parse(file_path)
        if content is None:
            # Same rationale as test_normal_file: make the no-content case
            # visible as a skip rather than a vacuous pass.
            pytest.skip(f"trafilatura reader returned no content: {error}")
        assert "中文" in content or "测试" in content