refactor: 调整模块导入路径,简化引用结构

- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
This commit is contained in:
2026-03-09 15:44:51 +08:00
parent 6e75c99d5b
commit 9daff73589
63 changed files with 103 additions and 76 deletions

View File

@@ -9,7 +9,7 @@ context: |
- 开发文档: README.md,每次迭代按需更新开发文档; 禁emoji/特殊字符
- skill文档: SKILL.md,每次迭代按需更新skill文档
- 测试: 所有需求必须设计全面测试
- 任务: 禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
- 任务: 除非用户直接要求,禁止创建git变更任务(push/commit等); git读取允许(status/log/diff等)
- 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文
- 项目阶段: 未上线,无用户,破坏性变更无需迁移说明
- Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明

View File

@@ -4,12 +4,12 @@ import argparse
import sys
from typing import List, Optional, Tuple
from scripts.core.exceptions import FileDetectionError, ReaderNotFoundError
from scripts.core.markdown import (
from core.exceptions import FileDetectionError, ReaderNotFoundError
from core.markdown import (
normalize_markdown_whitespace,
remove_markdown_images,
)
from scripts.readers import BaseReader
from readers import BaseReader
def parse_input(

View File

@@ -6,12 +6,12 @@ import logging
import os
import sys
import warnings
from pathlib import Path
# 将项目根目录添加到 sys.path,支持从任意位置执行脚本
_current_dir = os.path.dirname(os.path.abspath(__file__))
_project_root = os.path.dirname(_current_dir)
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
# 将 scripts/ 目录添加到 sys.path,支持从任意位置执行脚本
scripts_dir = Path(__file__).resolve().parent
if str(scripts_dir) not in sys.path:
sys.path.append(str(scripts_dir))
# 抑制第三方库的进度条和日志,仅保留解析结果输出
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
@@ -26,14 +26,14 @@ logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')
logging.getLogger('docling').setLevel(logging.ERROR)
logging.getLogger('unstructured').setLevel(logging.ERROR)
from scripts.core import (
from core import (
FileDetectionError,
ReaderNotFoundError,
output_result,
parse_input,
process_content,
)
from scripts.readers import READERS
from readers import READERS
def main() -> None:

View File

@@ -3,8 +3,8 @@
import os
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_valid_docx
from readers.base import BaseReader
from utils import is_valid_docx
from . import docling
from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling
from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown
from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
import zipfile
from typing import Any, Dict, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, safe_open_zip
from readers._utils import build_markdown_table, safe_open_zip
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table
from readers._utils import build_markdown_table
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown
from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,9 +4,9 @@ import os
import tempfile
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_url
from scripts.utils import encoding_detection
from readers.base import BaseReader
from utils import is_url
from utils import encoding_detection
from . import cleaner
from .downloader import download_html

View File

@@ -3,8 +3,8 @@
import os
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_valid_pdf
from readers.base import BaseReader
from utils import is_valid_pdf
from . import docling_ocr
from . import unstructured_ocr

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown
from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown
from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown
from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -3,8 +3,8 @@
import os
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_valid_pptx
from readers.base import BaseReader
from utils import is_valid_pptx
from . import docling
from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling
from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown
from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
import zipfile
from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, flush_list_stack
from readers._utils import build_markdown_table, flush_list_stack
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Any, List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, flush_list_stack
from readers._utils import build_markdown_table, flush_list_stack
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown
from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -3,8 +3,8 @@
import os
from typing import List, Optional, Tuple
from scripts.readers.base import BaseReader
from scripts.utils import is_valid_xlsx
from readers.base import BaseReader
from utils import is_valid_xlsx
from . import docling
from . import unstructured

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_docling
from readers._utils import parse_via_docling
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import parse_via_markitdown
from readers._utils import parse_via_markitdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
import zipfile
from typing import List, Optional, Tuple
from scripts.readers._utils import build_markdown_table, safe_open_zip
from readers._utils import build_markdown_table, safe_open_zip
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.readers._utils import convert_unstructured_to_markdown
from readers._utils import convert_unstructured_to_markdown
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -2,7 +2,7 @@
from typing import Optional, Tuple
from scripts.config import Config
from config import Config
def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -1 +1,12 @@
"""Tests package for lyxy-document."""
import sys
from pathlib import Path
# 将 scripts/ 目录添加到 sys.path
project_root = Path(__file__).resolve().parent.parent
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))

View File

@@ -1,7 +1,16 @@
"""测试配置和共享 fixtures。"""
import sys
from pathlib import Path
# 将 scripts/ 目录添加到 sys.path(必须在最顶部,在其他导入之前)
project_root = Path(__file__).resolve().parent.parent # tests/ 的父目录是项目根目录
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))
import pytest
from scripts.readers import READERS
from readers import READERS
@pytest.fixture

View File

@@ -2,6 +2,7 @@
import pytest
import sys
from pathlib import Path
from io import StringIO
from contextlib import redirect_stdout, redirect_stderr
@@ -22,7 +23,13 @@ def cli_runner():
Returns:
tuple: (stdout, stderr, exit_code)
"""
from scripts.lyxy_document_reader import main
# 将 scripts/ 目录添加到 sys.path
project_root = Path(__file__).resolve().parent.parent.parent # tests/test_cli/ 的父目录是 tests/,再父目录是项目根目录
scripts_dir = project_root / "scripts"
if str(scripts_dir) not in sys.path:
sys.path.insert(0, str(scripts_dir))
from lyxy_document_reader import main
# 保存原始 sys.argv 和 sys.exit
original_argv = sys.argv

View File

@@ -1,6 +1,6 @@
"""测试 Markdown 工具函数。"""
from scripts.core import (
from core import (
get_heading_level,
extract_titles,
normalize_markdown_whitespace,

View File

@@ -1,7 +1,7 @@
"""测试所有 DOCX Readers 的一致性。"""
import pytest
from scripts.readers.docx import (
from readers.docx import (
docling,
unstructured,
pypandoc,

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import docling
from readers.docx import docling
class TestDoclingDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import markitdown
from readers.docx import markitdown
class TestMarkitdownDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import native_xml
from readers.docx import native_xml
class TestNativeXmlDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import pypandoc
from readers.docx import pypandoc
class TestPypandocDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import DocxReader
from readers.docx import DocxReader
class TestPythonDocxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.docx import unstructured
from readers.docx import unstructured
class TestUnstructuredDocxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 HTML Readers 的一致性。"""
import pytest
from scripts.readers.html import (
from readers.html import (
html2text,
markitdown,
trafilatura,

View File

@@ -1,7 +1,7 @@
"""测试 Domscribe HTML Reader 的解析功能。"""
import pytest
from scripts.readers.html import domscribe
from readers.html import domscribe
class TestDomscribeHtmlReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.html import HtmlReader
from readers.html import HtmlReader
class TestHtml2TextReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown HTML Reader 的解析功能。"""
import pytest
from scripts.readers.html import markitdown
from readers.html import markitdown
class TestMarkitdownHtmlReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Trafilatura HTML Reader 的解析功能。"""
import pytest
from scripts.readers.html import trafilatura
from readers.html import trafilatura
class TestTrafilaturaHtmlReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 PDF Readers 的一致性。"""
import pytest
from scripts.readers.pdf import (
from readers.pdf import (
docling,
docling_ocr,
markitdown,

View File

@@ -1,7 +1,7 @@
"""测试 Docling OCR PDF Reader 的解析功能。"""
import pytest
from scripts.readers.pdf import docling_ocr
from readers.pdf import docling_ocr
class TestDoclingOcrPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Docling PDF Reader 的解析功能。"""
import pytest
from scripts.readers.pdf import docling
from readers.pdf import docling
class TestDoclingPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown PDF Reader 的解析功能。"""
import pytest
from scripts.readers.pdf import markitdown
from readers.pdf import markitdown
class TestMarkitdownPdfReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.pdf import PdfReader
from readers.pdf import PdfReader
class TestPypdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured OCR PDF Reader 的解析功能。"""
import pytest
from scripts.readers.pdf import unstructured_ocr
from readers.pdf import unstructured_ocr
class TestUnstructuredOcrPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured PDF Reader 的解析功能。"""
import pytest
from scripts.readers.pdf import unstructured
from readers.pdf import unstructured
class TestUnstructuredPdfReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试所有 PPTX Readers 的一致性。"""
import pytest
from scripts.readers.pptx import (
from readers.pptx import (
docling,
markitdown,
native_xml,

View File

@@ -1,7 +1,7 @@
"""测试 Docling PPTX Reader 的解析功能。"""
import pytest
from scripts.readers.pptx import docling
from readers.pptx import docling
class TestDoclingPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown PPTX Reader 的解析功能。"""
import pytest
from scripts.readers.pptx import markitdown
from readers.pptx import markitdown
class TestMarkitdownPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Native XML PPTX Reader 的解析功能。"""
import pytest
from scripts.readers.pptx import native_xml
from readers.pptx import native_xml
class TestNativeXmlPptxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.pptx import PptxReader
from readers.pptx import PptxReader
class TestPythonPptxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured PPTX Reader 的解析功能。"""
import pytest
from scripts.readers.pptx import unstructured
from readers.pptx import unstructured
class TestUnstructuredPptxReaderParse:

View File

@@ -2,7 +2,7 @@
import zipfile
import pytest
from scripts.readers._utils import (
from readers._utils import (
parse_via_markitdown,
parse_via_docling,
build_markdown_table,

View File

@@ -1,7 +1,7 @@
"""测试所有 XLSX Readers 的一致性。"""
import pytest
from scripts.readers.xlsx import (
from readers.xlsx import (
docling,
markitdown,
native_xml,

View File

@@ -1,7 +1,7 @@
"""测试 Docling XLSX Reader 的解析功能。"""
import pytest
from scripts.readers.xlsx import docling
from readers.xlsx import docling
class TestDoclingXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 MarkItDown XLSX Reader 的解析功能。"""
import pytest
from scripts.readers.xlsx import markitdown
from readers.xlsx import markitdown
class TestMarkitdownXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Native XML XLSX Reader 的解析功能。"""
import pytest
from scripts.readers.xlsx import native_xml
from readers.xlsx import native_xml
class TestNativeXmlXlsxReaderParse:

View File

@@ -2,7 +2,7 @@
import pytest
import os
from scripts.readers.xlsx import XlsxReader
from readers.xlsx import XlsxReader
class TestPandasXlsxReaderParse:

View File

@@ -1,7 +1,7 @@
"""测试 Unstructured XLSX Reader 的解析功能。"""
import pytest
from scripts.readers.xlsx import unstructured
from readers.xlsx import unstructured
class TestUnstructuredXlsxReaderParse:

View File

@@ -1,6 +1,6 @@
"""测试文件检测工具函数。"""
from scripts.utils import is_url, is_html_file
from utils import is_url, is_html_file
class TestIsUrl: