refactor: 调整模块导入路径,简化引用结构

- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*,scripts.utils.* 改为 utils.*,scripts.config 改为 config
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
This commit is contained in:
2026-03-09 15:44:51 +08:00
parent 6e75c99d5b
commit 9daff73589
63 changed files with 103 additions and 76 deletions

View File

@@ -9,7 +9,7 @@ context: |
- 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符 - 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符
- skill文档: SKILL.md,每次迭代按需更新skill文档 - skill文档: SKILL.md,每次迭代按需更新skill文档
- 测试: 所有需求必须设计全面测试 - 测试: 所有需求必须设计全面测试
- 任务: 禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等) - 任务: 除非用户直接要求,禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
- 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文 - 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文
- 项目阶段: 未上线,无用户,破坏性变更无需迁移说明 - 项目阶段: 未上线,无用户,破坏性变更无需迁移说明
- Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明 - Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明

View File

@@ -4,12 +4,12 @@ import argparse
import sys import sys
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.core.exceptions import FileDetectionError, ReaderNotFoundError from core.exceptions import FileDetectionError, ReaderNotFoundError
from scripts.core.markdown import ( from core.markdown import (
normalize_markdown_whitespace, normalize_markdown_whitespace,
remove_markdown_images, remove_markdown_images,
) )
from scripts.readers import BaseReader from readers import BaseReader
def parse_input( def parse_input(

View File

@@ -6,12 +6,12 @@ import logging
import os import os
import sys import sys
import warnings import warnings
from pathlib import Path
# 将项目根目录添加到 sys.path,支持从任意位置执行脚本 # 将 scripts/ 目录添加到 sys.path,支持从任意位置执行脚本
_current_dir = os.path.dirname(os.path.abspath(__file__)) scripts_dir = Path(__file__).resolve().parent
_project_root = os.path.dirname(_current_dir) if str(scripts_dir) not in sys.path:
if _project_root not in sys.path: sys.path.append(str(scripts_dir))
sys.path.insert(0, _project_root)
# 抑制第三方库的进度条和日志,仅保留解析结果输出 # 抑制第三方库的进度条和日志,仅保留解析结果输出
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
@@ -26,14 +26,14 @@ logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')
logging.getLogger('docling').setLevel(logging.ERROR) logging.getLogger('docling').setLevel(logging.ERROR)
logging.getLogger('unstructured').setLevel(logging.ERROR) logging.getLogger('unstructured').setLevel(logging.ERROR)
from scripts.core import ( from core import (
FileDetectionError, FileDetectionError,
ReaderNotFoundError, ReaderNotFoundError,
output_result, output_result,
parse_input, parse_input,
process_content, process_content,
) )
from scripts.readers import READERS from readers import READERS
def main() -> None: def main() -> None:

View File

@@ -3,8 +3,8 @@
import os import os
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader from readers.base import BaseReader
from scripts.utils import is_valid_docx from utils import is_valid_docx
from . import docling from . import docling
from . import unstructured from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
import zipfile import zipfile
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, safe_open_zip from readers._utils import build_markdown_table, safe_open_zip
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Any, List, Optional, Tuple from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table from readers._utils import build_markdown_table
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,9 +4,9 @@ import os
import tempfile import tempfile
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader from readers.base import BaseReader
from scripts.utils import is_url from utils import is_url
from scripts.utils import encoding_detection from utils import encoding_detection
from . import cleaner from . import cleaner
from .downloader import download_html from .downloader import download_html

View File

@@ -3,8 +3,8 @@
import os import os
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader from readers.base import BaseReader
from scripts.utils import is_valid_pdf from utils import is_valid_pdf
from . import docling_ocr from . import docling_ocr
from . import unstructured_ocr from . import unstructured_ocr

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -3,8 +3,8 @@
import os import os
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader from readers.base import BaseReader
from scripts.utils import is_valid_pptx from utils import is_valid_pptx
from . import docling from . import docling
from . import unstructured from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
import zipfile import zipfile
from typing import Any, List, Optional, Tuple from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, flush_list_stack from readers._utils import build_markdown_table, flush_list_stack
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Any, List, Optional, Tuple from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, flush_list_stack from readers._utils import build_markdown_table, flush_list_stack
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -3,8 +3,8 @@
import os import os
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader from readers.base import BaseReader
from scripts.utils import is_valid_xlsx from utils import is_valid_xlsx
from . import docling from . import docling
from . import unstructured from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
import zipfile import zipfile
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, safe_open_zip from readers._utils import build_markdown_table, safe_open_zip
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from scripts.config import Config from config import Config
def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]: def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -1 +1,12 @@
"""Tests package for lyxy-document.""" """Tests package for lyxy-document."""
import sys
from pathlib import Path
# 将 scripts/ 目录添加到 sys.path
project_root = Path(__file__).resolve().parent.parent
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))

View File

@@ -1,7 +1,16 @@
"""测试配置和共享 fixtures。""" """测试配置和共享 fixtures。"""
import sys
from pathlib import Path
# 将 scripts/ 目录添加到 sys.path(必须在最顶部,在其他导入之前)
project_root = Path(__file__).resolve().parent.parent # tests/ 的父目录是项目根目录
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))
import pytest import pytest
from scripts.readers import READERS from readers import READERS
@pytest.fixture @pytest.fixture

View File

@@ -2,6 +2,7 @@
import pytest import pytest
import sys import sys
from pathlib import Path
from io import StringIO from io import StringIO
from contextlib import redirect_stdout, redirect_stderr from contextlib import redirect_stdout, redirect_stderr
@@ -22,7 +23,13 @@ def cli_runner():
Returns: Returns:
tuple: (stdout, stderr, exit_code) tuple: (stdout, stderr, exit_code)
""" """
from scripts.lyxy_document_reader import main # 将 scripts/ 目录添加到 sys.path
project_root = Path(__file__).resolve().parent.parent.parent # tests/test_cli/ 的父目录是 tests/,再父目录是项目根目录
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))
from lyxy_document_reader import main
# 保存原始 sys.argv 和 sys.exit # 保存原始 sys.argv 和 sys.exit
original_argv = sys.argv original_argv = sys.argv

View File

@@ -1,6 +1,6 @@
"""测试 Markdown 工具函数。""" """测试 Markdown 工具函数。"""
from scripts.core import ( from core import (
get_heading_level, get_heading_level,
extract_titles, extract_titles,
normalize_markdown_whitespace, normalize_markdown_whitespace,

View File

@@ -1,7 +1,7 @@
"""测试所有 DOCX Readers 的一致性。""" """测试所有 DOCX Readers 的一致性。"""
import pytest import pytest
from scripts.readers.docx import ( from readers.docx import (
docling, docling,
unstructured, unstructured,
pypandoc, pypandoc,

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import docling from readers.docx import docling
class TestDoclingDocxReaderParse: class TestDoclingDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import markitdown from readers.docx import markitdown
class TestMarkitdownDocxReaderParse: class TestMarkitdownDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import native_xml from readers.docx import native_xml
class TestNativeXmlDocxReaderParse: class TestNativeXmlDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import pypandoc from readers.docx import pypandoc
class TestPypandocDocxReaderParse: class TestPypandocDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import DocxReader from readers.docx import DocxReader
class TestPythonDocxReaderParse: class TestPythonDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.docx import unstructured from readers.docx import unstructured
class TestUnstructuredDocxReaderParse: class TestUnstructuredDocxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 HTML Readers 的一致性。""" """测试所有 HTML Readers 的一致性。"""
import pytest import pytest
from scripts.readers.html import ( from readers.html import (
html2text, html2text,
markitdown, markitdown,
trafilatura, trafilatura,

View File

@@ -1,7 +1,7 @@
"""测试 Domscribe HTML Reader 的解析功能。""" """测试 Domscribe HTML Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.html import domscribe from readers.html import domscribe
class TestDomscribeHtmlReaderParse: class TestDomscribeHtmlReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.html import HtmlReader from readers.html import HtmlReader
class TestHtml2TextReaderParse: class TestHtml2TextReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown HTML Reader 的解析功能。""" """测试 MarkItDown HTML Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.html import markitdown from readers.html import markitdown
class TestMarkitdownHtmlReaderParse: class TestMarkitdownHtmlReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Trafilatura HTML Reader 的解析功能。""" """测试 Trafilatura HTML Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.html import trafilatura from readers.html import trafilatura
class TestTrafilaturaHtmlReaderParse: class TestTrafilaturaHtmlReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 PDF Readers 的一致性。""" """测试所有 PDF Readers 的一致性。"""
import pytest import pytest
from scripts.readers.pdf import ( from readers.pdf import (
docling, docling,
docling_ocr, docling_ocr,
markitdown, markitdown,

View File

@@ -1,7 +1,7 @@
"""测试 Docling OCR PDF Reader 的解析功能。""" """测试 Docling OCR PDF Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pdf import docling_ocr from readers.pdf import docling_ocr
class TestDoclingOcrPdfReaderParse: class TestDoclingOcrPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Docling PDF Reader 的解析功能。""" """测试 Docling PDF Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pdf import docling from readers.pdf import docling
class TestDoclingPdfReaderParse: class TestDoclingPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown PDF Reader 的解析功能。""" """测试 MarkItDown PDF Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pdf import markitdown from readers.pdf import markitdown
class TestMarkitdownPdfReaderParse: class TestMarkitdownPdfReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.pdf import PdfReader from readers.pdf import PdfReader
class TestPypdfReaderParse: class TestPypdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured OCR PDF Reader 的解析功能。""" """测试 Unstructured OCR PDF Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pdf import unstructured_ocr from readers.pdf import unstructured_ocr
class TestUnstructuredOcrPdfReaderParse: class TestUnstructuredOcrPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured PDF Reader 的解析功能。""" """测试 Unstructured PDF Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pdf import unstructured from readers.pdf import unstructured
class TestUnstructuredPdfReaderParse: class TestUnstructuredPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 PPTX Readers 的一致性。""" """测试所有 PPTX Readers 的一致性。"""
import pytest import pytest
from scripts.readers.pptx import ( from readers.pptx import (
docling, docling,
markitdown, markitdown,
native_xml, native_xml,

View File

@@ -1,7 +1,7 @@
"""测试 Docling PPTX Reader 的解析功能。""" """测试 Docling PPTX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pptx import docling from readers.pptx import docling
class TestDoclingPptxReaderParse: class TestDoclingPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown PPTX Reader 的解析功能。""" """测试 MarkItDown PPTX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pptx import markitdown from readers.pptx import markitdown
class TestMarkitdownPptxReaderParse: class TestMarkitdownPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Native XML PPTX Reader 的解析功能。""" """测试 Native XML PPTX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pptx import native_xml from readers.pptx import native_xml
class TestNativeXmlPptxReaderParse: class TestNativeXmlPptxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.pptx import PptxReader from readers.pptx import PptxReader
class TestPythonPptxReaderParse: class TestPythonPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured PPTX Reader 的解析功能。""" """测试 Unstructured PPTX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.pptx import unstructured from readers.pptx import unstructured
class TestUnstructuredPptxReaderParse: class TestUnstructuredPptxReaderParse:

View File

@@ -2,7 +2,7 @@
import zipfile import zipfile
import pytest import pytest
from scripts.readers._utils import ( from readers._utils import (
parse_via_markitdown, parse_via_markitdown,
parse_via_docling, parse_via_docling,
build_markdown_table, build_markdown_table,

View File

@@ -1,7 +1,7 @@
"""测试所有 XLSX Readers 的一致性。""" """测试所有 XLSX Readers 的一致性。"""
import pytest import pytest
from scripts.readers.xlsx import ( from readers.xlsx import (
docling, docling,
markitdown, markitdown,
native_xml, native_xml,

View File

@@ -1,7 +1,7 @@
"""测试 Docling XLSX Reader 的解析功能。""" """测试 Docling XLSX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.xlsx import docling from readers.xlsx import docling
class TestDoclingXlsxReaderParse: class TestDoclingXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown XLSX Reader 的解析功能。""" """测试 MarkItDown XLSX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.xlsx import markitdown from readers.xlsx import markitdown
class TestMarkitdownXlsxReaderParse: class TestMarkitdownXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Native XML XLSX Reader 的解析功能。""" """测试 Native XML XLSX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.xlsx import native_xml from readers.xlsx import native_xml
class TestNativeXmlXlsxReaderParse: class TestNativeXmlXlsxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest import pytest
import os import os
from scripts.readers.xlsx import XlsxReader from readers.xlsx import XlsxReader
class TestPandasXlsxReaderParse: class TestPandasXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured XLSX Reader 的解析功能。""" """测试 Unstructured XLSX Reader 的解析功能。"""
import pytest import pytest
from scripts.readers.xlsx import unstructured from readers.xlsx import unstructured
class TestUnstructuredXlsxReaderParse: class TestUnstructuredXlsxReaderParse:

View File

@@ -1,6 +1,6 @@
"""测试文件检测工具函数。""" """测试文件检测工具函数。"""
from scripts.utils import is_url, is_html_file from utils import is_url, is_html_file
class TestIsUrl: class TestIsUrl: