diff --git a/README.md b/README.md index 73eecff..9cc2879 100644 --- a/README.md +++ b/README.md @@ -108,21 +108,26 @@ else: ``` lyxy-document/ -├── lyxy_document_reader.py # 统一 CLI 入口 -├── core/ # 核心模块 -│ ├── exceptions.py # 自定义异常体系 -│ ├── markdown.py # Markdown 工具函数 -│ └── parser.py # 统一解析调度器 -├── readers/ # 格式阅读器 -│ ├── base.py # Reader 基类 -│ ├── docx/ # DOCX 阅读器 -│ ├── xlsx/ # XLSX 阅读器 -│ ├── pptx/ # PPTX 阅读器 -│ ├── pdf/ # PDF 阅读器 -│ └── html/ # HTML/URL 阅读器 -├── utils/ # 工具函数 -│ └── file_detection.py # 文件类型检测 -└── tests/ # 测试 +├── scripts/ # 核心代码目录 +│ ├── lyxy_document_reader.py # 统一 CLI 入口 +│ ├── config.py # 统一配置类 +│ ├── core/ # 核心模块 +│ │ ├── exceptions.py # 自定义异常体系 +│ │ ├── markdown.py # Markdown 工具函数 +│ │ └── parser.py # 统一解析调度器 +│ ├── readers/ # 格式阅读器 +│ │ ├── base.py # Reader 基类 +│ │ ├── docx/ # DOCX 阅读器 +│ │ ├── xlsx/ # XLSX 阅读器 +│ │ ├── pptx/ # PPTX 阅读器 +│ │ ├── pdf/ # PDF 阅读器 +│ │ └── html/ # HTML/URL 阅读器 +│ └── utils/ # 工具函数 +│ ├── file_detection.py # 文件类型检测 +│ └── encoding_detection.py # 编码检测 +├── tests/ # 测试 +├── openspec/ # 规范文档 +└── README.md # 项目文档 ``` ## 解析器优先级 diff --git a/main.py b/main.py deleted file mode 100644 index a577bb9..0000000 --- a/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from lyxy-document!") - - -if __name__ == "__main__": - main() diff --git a/openspec/config.yaml b/openspec/config.yaml index dd89f0a..b0a1603 100644 --- a/openspec/config.yaml +++ b/openspec/config.yaml @@ -5,7 +5,6 @@ context: | - 语言: 仅中文(交流/注释/文档/代码) - Python: 始终用uv运行(脚本/临时命令uv run python -c); 禁用主机python/禁主机安装包 - 依赖: pyproject.toml声明,使用uv安装 - - 临时文件: 统一放temp目录 - 主机环境: 禁止污染配置,需操作须请求用户 - 文档: README.md,每次迭代按需更新用户文档和开发文档; 禁emoji/特殊字符 - 测试: 所有需求必须设计全面测试 @@ -13,3 +12,11 @@ context: | - 代码: 模块文件150-300行; 错误需自定义异常+清晰信息+位置上下文 - 项目阶段: 未上线,无用户,破坏性变更无需迁移说明 - Git提交: 仅中文; 格式为"类型: 简短描述",类型可选: feat(新功能)/fix(修复)/refactor(重构)/docs(文档)/style(格式)/test(测试)/chore(构建/工具); 多行描述空行后加详细说明 + + # 项目目录结构 + - scripts/: 核心代码目录 + - tests/: 测试目录 + - openspec/: 规范文档目录 + - temp/: 开发临时文件目录 + - pyproject.toml: 项目配置 + - README.md: 项目文档 diff --git a/pyproject.toml b/pyproject.toml index 27f5f49..45a571d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dev = [ ] [project.scripts] -lyxy-document-reader = "lyxy_document_reader:main" +lyxy-document-reader = "scripts.lyxy_document_reader:main" [build-system] requires = ["hatchling"] @@ -91,4 +91,4 @@ disallow_untyped_defs = true [tool.pytest.ini_options] testpaths = ["tests"] -pythonpath = ["."] +pythonpath = ["scripts", "."] diff --git a/config.py b/scripts/config.py similarity index 100% rename from config.py rename to scripts/config.py diff --git a/core/__init__.py b/scripts/core/__init__.py similarity index 100% rename from core/__init__.py rename to scripts/core/__init__.py diff --git a/core/exceptions.py b/scripts/core/exceptions.py similarity index 100% rename from core/exceptions.py rename to scripts/core/exceptions.py diff --git a/core/markdown.py b/scripts/core/markdown.py similarity index 100% rename from core/markdown.py rename to scripts/core/markdown.py diff --git a/core/parser.py b/scripts/core/parser.py similarity index 93% rename from core/parser.py rename to scripts/core/parser.py index 484d50e..7d9616b 100644 --- a/core/parser.py +++ b/scripts/core/parser.py @@ -4,12 +4,12 @@ import argparse import sys from typing import List, Optional, Tuple -from core.exceptions import FileDetectionError, ReaderNotFoundError -from core.markdown import ( +from scripts.core.exceptions import FileDetectionError, ReaderNotFoundError +from scripts.core.markdown import ( normalize_markdown_whitespace, remove_markdown_images, ) -from readers import BaseReader +from scripts.readers import BaseReader def parse_input( diff --git a/lyxy_document_reader.py b/scripts/lyxy_document_reader.py similarity index 97% rename from lyxy_document_reader.py rename to scripts/lyxy_document_reader.py index 1625100..dbab245 100644 --- a/lyxy_document_reader.py +++ b/scripts/lyxy_document_reader.py @@ -20,14 +20,14 @@ logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s') logging.getLogger('docling').setLevel(logging.ERROR) logging.getLogger('unstructured').setLevel(logging.ERROR) -from core import ( +from scripts.core import ( FileDetectionError, ReaderNotFoundError, output_result, parse_input, process_content, ) -from readers import READERS +from scripts.readers import READERS def main() -> None: diff --git a/readers/__init__.py b/scripts/readers/__init__.py similarity index 100% rename from readers/__init__.py rename to scripts/readers/__init__.py diff --git a/readers/base.py b/scripts/readers/base.py similarity index 100% rename from readers/base.py rename to scripts/readers/base.py diff --git a/readers/docx/__init__.py b/scripts/readers/docx/__init__.py similarity index 94% rename from readers/docx/__init__.py rename to scripts/readers/docx/__init__.py index 628ed95..f12e851 100644 --- a/readers/docx/__init__.py +++ b/scripts/readers/docx/__init__.py @@ -3,8 +3,8 @@ import os from typing import List, Optional, Tuple -from readers.base import BaseReader -from utils import is_valid_docx +from scripts.readers.base import BaseReader +from scripts.utils import is_valid_docx from . import docling from . import unstructured diff --git a/readers/docx/docling.py b/scripts/readers/docx/docling.py similarity index 84% rename from readers/docx/docling.py rename to scripts/readers/docx/docling.py index f9743db..754fd7d 100644 --- a/readers/docx/docling.py +++ b/scripts/readers/docx/docling.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_docling +from scripts.core import parse_with_docling def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/docx/markitdown.py b/scripts/readers/docx/markitdown.py similarity index 83% rename from readers/docx/markitdown.py rename to scripts/readers/docx/markitdown.py index e0b9758..9308c2b 100644 --- a/readers/docx/markitdown.py +++ b/scripts/readers/docx/markitdown.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_markitdown +from scripts.core import parse_with_markitdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/docx/native_xml.py b/scripts/readers/docx/native_xml.py similarity index 98% rename from readers/docx/native_xml.py rename to scripts/readers/docx/native_xml.py index efc96ca..69c654e 100644 --- a/readers/docx/native_xml.py +++ b/scripts/readers/docx/native_xml.py @@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET import zipfile from typing import Any, Dict, List, Optional, Tuple -from core import build_markdown_table, safe_open_zip +from scripts.core import build_markdown_table, safe_open_zip def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/docx/pypandoc.py b/scripts/readers/docx/pypandoc.py similarity index 100% rename from readers/docx/pypandoc.py rename to scripts/readers/docx/pypandoc.py diff --git a/readers/docx/python_docx.py b/scripts/readers/docx/python_docx.py similarity index 98% rename from readers/docx/python_docx.py rename to scripts/readers/docx/python_docx.py index 62f041f..7b45a43 100644 --- a/readers/docx/python_docx.py +++ b/scripts/readers/docx/python_docx.py @@ -2,7 +2,7 @@ from typing import Any, List, Optional, Tuple -from core import build_markdown_table +from scripts.core import build_markdown_table def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/docx/unstructured.py b/scripts/readers/docx/unstructured.py similarity index 92% rename from readers/docx/unstructured.py rename to scripts/readers/docx/unstructured.py index b596ee5..6112239 100644 --- a/readers/docx/unstructured.py +++ b/scripts/readers/docx/unstructured.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import _unstructured_elements_to_markdown +from scripts.core import _unstructured_elements_to_markdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/html/__init__.py b/scripts/readers/html/__init__.py similarity index 95% rename from readers/html/__init__.py rename to scripts/readers/html/__init__.py index 9375f73..af0bd49 100644 --- a/readers/html/__init__.py +++ b/scripts/readers/html/__init__.py @@ -3,9 +3,9 @@ import os from typing import List, Optional, Tuple -from readers.base import BaseReader -from utils import is_url -import encoding_detection +from scripts.readers.base import BaseReader +from scripts.utils import is_url +from scripts.utils import encoding_detection from . import cleaner from . import downloader diff --git a/readers/html/cleaner.py b/scripts/readers/html/cleaner.py similarity index 100% rename from readers/html/cleaner.py rename to scripts/readers/html/cleaner.py diff --git a/readers/html/domscribe.py b/scripts/readers/html/domscribe.py similarity index 100% rename from readers/html/domscribe.py rename to scripts/readers/html/domscribe.py diff --git a/readers/html/downloader.py b/scripts/readers/html/downloader.py similarity index 100% rename from readers/html/downloader.py rename to scripts/readers/html/downloader.py diff --git a/readers/html/html2text.py b/scripts/readers/html/html2text.py similarity index 100% rename from readers/html/html2text.py rename to scripts/readers/html/html2text.py diff --git a/readers/html/markitdown.py b/scripts/readers/html/markitdown.py similarity index 100% rename from readers/html/markitdown.py rename to scripts/readers/html/markitdown.py diff --git a/readers/html/trafilatura.py b/scripts/readers/html/trafilatura.py similarity index 100% rename from readers/html/trafilatura.py rename to scripts/readers/html/trafilatura.py diff --git a/readers/pdf/__init__.py b/scripts/readers/pdf/__init__.py similarity index 94% rename from readers/pdf/__init__.py rename to scripts/readers/pdf/__init__.py index a33b301..6558a62 100644 --- a/readers/pdf/__init__.py +++ b/scripts/readers/pdf/__init__.py @@ -3,8 +3,8 @@ import os from typing import List, Optional, Tuple -from readers.base import BaseReader -from utils import is_valid_pdf +from scripts.readers.base import BaseReader +from scripts.utils import is_valid_pdf from . import docling_ocr from . import unstructured_ocr diff --git a/readers/pdf/docling.py b/scripts/readers/pdf/docling.py similarity index 100% rename from readers/pdf/docling.py rename to scripts/readers/pdf/docling.py diff --git a/readers/pdf/docling_ocr.py b/scripts/readers/pdf/docling_ocr.py similarity index 100% rename from readers/pdf/docling_ocr.py rename to scripts/readers/pdf/docling_ocr.py diff --git a/readers/pdf/markitdown.py b/scripts/readers/pdf/markitdown.py similarity index 83% rename from readers/pdf/markitdown.py rename to scripts/readers/pdf/markitdown.py index 43d8bda..1a17bc9 100644 --- a/readers/pdf/markitdown.py +++ b/scripts/readers/pdf/markitdown.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_markitdown +from scripts.core import parse_with_markitdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pdf/pypdf.py b/scripts/readers/pdf/pypdf.py similarity index 100% rename from readers/pdf/pypdf.py rename to scripts/readers/pdf/pypdf.py diff --git a/readers/pdf/unstructured.py b/scripts/readers/pdf/unstructured.py similarity index 93% rename from readers/pdf/unstructured.py rename to scripts/readers/pdf/unstructured.py index 27e8845..48868b1 100644 --- a/readers/pdf/unstructured.py +++ b/scripts/readers/pdf/unstructured.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import _unstructured_elements_to_markdown +from scripts.core import _unstructured_elements_to_markdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pdf/unstructured_ocr.py b/scripts/readers/pdf/unstructured_ocr.py similarity index 94% rename from readers/pdf/unstructured_ocr.py rename to scripts/readers/pdf/unstructured_ocr.py index 917f991..37f486d 100644 --- a/readers/pdf/unstructured_ocr.py +++ b/scripts/readers/pdf/unstructured_ocr.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import _unstructured_elements_to_markdown +from scripts.core import _unstructured_elements_to_markdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pptx/__init__.py b/scripts/readers/pptx/__init__.py similarity index 94% rename from readers/pptx/__init__.py rename to scripts/readers/pptx/__init__.py index 55b1674..05bb2cb 100644 --- a/readers/pptx/__init__.py +++ b/scripts/readers/pptx/__init__.py @@ -3,8 +3,8 @@ import os from typing import List, Optional, Tuple -from readers.base import BaseReader -from utils import is_valid_pptx +from scripts.readers.base import BaseReader +from scripts.utils import is_valid_pptx from . import docling from . import unstructured diff --git a/readers/pptx/docling.py b/scripts/readers/pptx/docling.py similarity index 84% rename from readers/pptx/docling.py rename to scripts/readers/pptx/docling.py index 8178f0e..e2e4b87 100644 --- a/readers/pptx/docling.py +++ b/scripts/readers/pptx/docling.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_docling +from scripts.core import parse_with_docling def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pptx/markitdown.py b/scripts/readers/pptx/markitdown.py similarity index 83% rename from readers/pptx/markitdown.py rename to scripts/readers/pptx/markitdown.py index 4711c1a..bcd2d17 100644 --- a/readers/pptx/markitdown.py +++ b/scripts/readers/pptx/markitdown.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_markitdown +from scripts.core import parse_with_markitdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pptx/native_xml.py b/scripts/readers/pptx/native_xml.py similarity index 98% rename from readers/pptx/native_xml.py rename to scripts/readers/pptx/native_xml.py index 57db449..bf27fff 100644 --- a/readers/pptx/native_xml.py +++ b/scripts/readers/pptx/native_xml.py @@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET import zipfile from typing import Any, List, Optional, Tuple -from core import build_markdown_table, flush_list_stack +from scripts.core import build_markdown_table, flush_list_stack def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pptx/python_pptx.py b/scripts/readers/pptx/python_pptx.py similarity index 98% rename from readers/pptx/python_pptx.py rename to scripts/readers/pptx/python_pptx.py index 95af88b..84ddad5 100644 --- a/readers/pptx/python_pptx.py +++ b/scripts/readers/pptx/python_pptx.py @@ -2,7 +2,7 @@ from typing import Any, List, Optional, Tuple -from core import build_markdown_table, flush_list_stack +from scripts.core import build_markdown_table, flush_list_stack def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/pptx/unstructured.py b/scripts/readers/pptx/unstructured.py similarity index 92% rename from readers/pptx/unstructured.py rename to scripts/readers/pptx/unstructured.py index d915706..bd37e04 100644 --- a/readers/pptx/unstructured.py +++ b/scripts/readers/pptx/unstructured.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import _unstructured_elements_to_markdown +from scripts.core import _unstructured_elements_to_markdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/xlsx/__init__.py b/scripts/readers/xlsx/__init__.py similarity index 94% rename from readers/xlsx/__init__.py rename to scripts/readers/xlsx/__init__.py index c225b4b..66e0077 100644 --- a/readers/xlsx/__init__.py +++ b/scripts/readers/xlsx/__init__.py @@ -3,8 +3,8 @@ import os from typing import List, Optional, Tuple -from readers.base import BaseReader -from utils import is_valid_xlsx +from scripts.readers.base import BaseReader +from scripts.utils import is_valid_xlsx from . import docling from . import unstructured diff --git a/readers/xlsx/docling.py b/scripts/readers/xlsx/docling.py similarity index 84% rename from readers/xlsx/docling.py rename to scripts/readers/xlsx/docling.py index 1ddbce8..cfd80b5 100644 --- a/readers/xlsx/docling.py +++ b/scripts/readers/xlsx/docling.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_docling +from scripts.core import parse_with_docling def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/xlsx/markitdown.py b/scripts/readers/xlsx/markitdown.py similarity index 83% rename from readers/xlsx/markitdown.py rename to scripts/readers/xlsx/markitdown.py index 6af2060..6864c7c 100644 --- a/readers/xlsx/markitdown.py +++ b/scripts/readers/xlsx/markitdown.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import parse_with_markitdown +from scripts.core import parse_with_markitdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/xlsx/native_xml.py b/scripts/readers/xlsx/native_xml.py similarity index 99% rename from readers/xlsx/native_xml.py rename to scripts/readers/xlsx/native_xml.py index 03fbb64..93ece2f 100644 --- a/readers/xlsx/native_xml.py +++ b/scripts/readers/xlsx/native_xml.py @@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET import zipfile from typing import List, Optional, Tuple -from core import build_markdown_table, safe_open_zip +from scripts.core import build_markdown_table, safe_open_zip def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/readers/xlsx/pandas.py b/scripts/readers/xlsx/pandas.py similarity index 100% rename from readers/xlsx/pandas.py rename to scripts/readers/xlsx/pandas.py diff --git a/readers/xlsx/unstructured.py b/scripts/readers/xlsx/unstructured.py similarity index 92% rename from readers/xlsx/unstructured.py rename to scripts/readers/xlsx/unstructured.py index c323816..b101c3b 100644 --- a/readers/xlsx/unstructured.py +++ b/scripts/readers/xlsx/unstructured.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from core import _unstructured_elements_to_markdown +from scripts.core import _unstructured_elements_to_markdown def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/utils/__init__.py b/scripts/utils/__init__.py similarity index 100% rename from utils/__init__.py rename to scripts/utils/__init__.py diff --git a/encoding_detection.py b/scripts/utils/encoding_detection.py similarity index 98% rename from encoding_detection.py rename to scripts/utils/encoding_detection.py index 60eeb72..32ccb01 100644 --- a/encoding_detection.py +++ b/scripts/utils/encoding_detection.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -from config import Config +from scripts.config import Config def detect_encoding(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/utils/file_detection.py b/scripts/utils/file_detection.py similarity index 100% rename from utils/file_detection.py rename to scripts/utils/file_detection.py diff --git a/tests/test_core/test_markdown.py b/tests/test_core/test_markdown.py index e21fe6e..047e6f4 100644 --- a/tests/test_core/test_markdown.py +++ b/tests/test_core/test_markdown.py @@ -1,6 +1,6 @@ """测试 Markdown 工具函数。""" -from core import ( +from scripts.core import ( get_heading_level, extract_titles, normalize_markdown_whitespace, diff --git a/tests/test_utils/test_file_detection.py b/tests/test_utils/test_file_detection.py index 9996052..17d3fc1 100644 --- a/tests/test_utils/test_file_detection.py +++ b/tests/test_utils/test_file_detection.py @@ -1,6 +1,6 @@ """测试文件检测工具函数。""" -from utils import is_url, is_html_file +from scripts.utils import is_url, is_html_file class TestIsUrl: