refactor: 将核心代码迁移到 scripts 目录

- 创建 scripts/ 目录作为核心代码根目录
- 移动 core/, readers/, utils/ 到 scripts/ 下
- 移动 config.py, lyxy_document_reader.py 到 scripts/
- 移动 encoding_detection.py 到 scripts/utils/
- 更新 pyproject.toml 中的入口点路径和 pytest 配置
- 更新所有内部导入语句为 scripts.* 模块
- 更新 README.md 目录结构说明
- 更新 openspec/config.yaml 添加目录结构说明
- 删除无用的 main.py

此变更使项目结构更清晰，便于区分核心代码与测试、文档等支撑文件。
2026-03-08 17:41:03 +08:00
parent 750ef50a8d
commit 15b63800a8
50 changed files with 66 additions and 60 deletions
--- a/scripts/readers/docx/__init__.py
+++ b/scripts/readers/docx/__init__.py
@@ -0,0 +1,57 @@
+"""DOCX 文件阅读器，支持多种解析方法。"""
+
+import os
+from typing import List, Optional, Tuple
+
+from scripts.readers.base import BaseReader
+from scripts.utils import is_valid_docx
+
+from . import docling
+from . import unstructured
+from . import markitdown
+from . import pypandoc
+from . import python_docx
+from . import native_xml
+
+
+# DOCX parsing backends as (display name, parse callable) pairs.
+# DocxReader.parse tries them in this order and returns the first result
+# whose content is not None; the display name is used in failure messages.
+PARSERS = [
+    ("docling", docling.parse),
+    ("unstructured", unstructured.parse),
+    ("pypandoc-binary", pypandoc.parse),
+    ("MarkItDown", markitdown.parse),
+    ("python-docx", python_docx.parse),
+    ("XML 原生解析", native_xml.parse),
+]
+
+
+class DocxReader(BaseReader):
+    """Reader for DOCX files that tries each backend in PARSERS in turn."""
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Return the file extensions this reader handles."""
+        return [".docx"]
+
+    def supports(self, file_path: str) -> bool:
+        """Return True when ``file_path`` ends with a supported extension.
+
+        The comparison is case-insensitive, so ``REPORT.DOCX`` is accepted.
+        """
+        return file_path.lower().endswith(tuple(self.supported_extensions))
+
+    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
+        """Parse a DOCX file with the first backend that yields content.
+
+        Args:
+            file_path: Path of the file to parse.
+
+        Returns:
+            ``(content, failures)``. ``content`` is the text returned by the
+            first backend whose content is not None, or None when the file
+            is missing, is rejected by ``is_valid_docx``, or every backend
+            failed. ``failures`` lists one message per backend that was
+            tried without success (or the single precondition error).
+        """
+        # The file must exist before any backend is attempted.
+        if not os.path.exists(file_path):
+            return None, ["文件不存在"]
+
+        # Reject files that fail the DOCX format check.
+        if not is_valid_docx(file_path):
+            return None, ["不是有效的 DOCX 文件"]
+
+        failures: List[str] = []
+        for parser_name, parser_func in PARSERS:
+            try:
+                content, error = parser_func(file_path)
+            except Exception as exc:
+                # A backend that raises must not abort the fallback chain;
+                # record it as a failure and move on to the next backend.
+                content, error = None, f"解析失败: {exc}"
+            if content is not None:
+                return content, failures
+            failures.append(f"- {parser_name}: {error}")
+
+        return None, failures
--- a/scripts/readers/docx/docling.py
+++ b/scripts/readers/docx/docling.py
@@ -0,0 +1,10 @@
+"""使用 docling 库解析 DOCX 文件"""
+
+from typing import Optional, Tuple
+
+from scripts.core import parse_with_docling
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file with the docling library.
+
+    Delegates to ``parse_with_docling`` from ``scripts.core`` and returns
+    its result unchanged.
+
+    Returns:
+        A two-item tuple; presumably ``(content, error)`` like the other
+        DOCX parsers in this package (confirm against
+        ``parse_with_docling``).
+    """
+    return parse_with_docling(file_path)
--- a/scripts/readers/docx/markitdown.py
+++ b/scripts/readers/docx/markitdown.py
@@ -0,0 +1,10 @@
+"""使用 MarkItDown 库解析 DOCX 文件"""
+
+from typing import Optional, Tuple
+
+from scripts.core import parse_with_markitdown
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file with the MarkItDown library.
+
+    Delegates to ``parse_with_markitdown`` from ``scripts.core`` and returns
+    its result unchanged.
+
+    Returns:
+        A two-item tuple; presumably ``(content, error)`` like the other
+        DOCX parsers in this package (confirm against
+        ``parse_with_markitdown``).
+    """
+    return parse_with_markitdown(file_path)
--- a/scripts/readers/docx/native_xml.py
+++ b/scripts/readers/docx/native_xml.py
@@ -0,0 +1,135 @@
+"""使用 XML 原生解析 DOCX 文件"""
+
+import xml.etree.ElementTree as ET
+import zipfile
+from typing import Any, Dict, List, Optional, Tuple
+
+from scripts.core import build_markdown_table, safe_open_zip
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert a DOCX file to Markdown by reading its XML parts directly.
+
+    Reads ``word/styles.xml`` (best effort) to map style ids to heading
+    levels and list kinds, then walks the direct children of ``w:body`` in
+    ``word/document.xml`` and emits headings, list items, plain paragraphs
+    and tables.
+
+    Args:
+        file_path: Path to the DOCX (zip) file.
+
+    Returns:
+        ``(markdown, None)`` on success, or ``(None, error_message)`` when
+        the document part is missing, ``w:body`` is absent, the document
+        yields no text, or any exception is raised while reading/parsing.
+    """
+    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    namespaces = {"w": word_namespace}
+    val_attr = f"{{{word_namespace}}}val"
+    paragraph_tag = f"{{{word_namespace}}}p"
+
+    _STYLE_NAME_TO_HEADING = {
+        "title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
+        "heading 4": 4, "heading 5": 5, "heading 6": 6,
+    }
+    # Attribute spellings that switch a toggle property (w:b, w:i) off.
+    _TOGGLE_OFF_VALUES = {"0", "false", "off"}
+
+    def is_toggle_on(run: Any, prop: str) -> bool:
+        """Return True when the run's own ``w:rPr`` enables ``w:<prop>``."""
+        elem = run.find(f"w:rPr/w:{prop}", namespaces=namespaces)
+        if elem is None:
+            return False
+        # A toggle element without w:val means "on"; an explicit off value
+        # (e.g. <w:b w:val="0"/>) must not be rendered as formatting.
+        return elem.get(val_attr, "true").strip().lower() not in _TOGGLE_OFF_VALUES
+
+    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
+        """Return the text of all runs under ``para`` with bold/italic marks."""
+        texts = []
+        for run in para.findall(".//w:r", namespaces=namespaces):
+            # A run can hold several w:t children (split by w:br, w:tab, ...);
+            # keep all of them instead of only the first one.
+            text = "".join(
+                text_elem.text or ""
+                for text_elem in run.findall("w:t", namespaces=namespaces)
+            )
+            if not text:
+                continue
+            if is_toggle_on(run, "b"):
+                text = f"**{text}**"
+            if is_toggle_on(run, "i"):
+                text = f"*{text}*"
+            texts.append(text)
+        return "".join(texts).strip()
+
+    def iter_outer_paragraphs(elem: Any):
+        """Yield ``w:p`` descendants of ``elem`` not nested in another ``w:p``."""
+        for child in elem:
+            if child.tag == paragraph_tag:
+                yield child
+            else:
+                yield from iter_outer_paragraphs(child)
+
+    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
+        """Render a ``w:tbl`` element as a Markdown table ("" when no rows)."""
+        # NOTE: ".//" also matches rows and cells of tables nested in a cell.
+        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
+        if not rows:
+            return ""
+        rows_data = []
+        for row in rows:
+            cell_texts = []
+            for cell in row.findall(".//w:tc", namespaces=namespaces):
+                # Separate the cell's paragraphs with a space so their words
+                # do not run together on the single Markdown table line.
+                paragraph_texts = (
+                    extract_text_with_formatting(para, namespaces)
+                    for para in iter_outer_paragraphs(cell)
+                )
+                cell_text = " ".join(text for text in paragraph_texts if text)
+                cell_texts.append(cell_text.replace("\n", " ").strip())
+            if cell_texts:
+                rows_data.append(cell_texts)
+        return build_markdown_table(rows_data)
+
+    try:
+        style_to_level = {}
+        style_to_list = {}
+        markdown_lines = []
+
+        with zipfile.ZipFile(file_path) as zip_file:
+            try:
+                styles_file = safe_open_zip(zip_file, "word/styles.xml")
+                if styles_file:
+                    styles_root = ET.parse(styles_file).getroot()
+                    for style in styles_root.findall(
+                        ".//w:style", namespaces=namespaces
+                    ):
+                        style_id = style.get(f"{{{word_namespace}}}styleId")
+                        style_name_elem = style.find("w:name", namespaces=namespaces)
+                        if not style_id or style_name_elem is None:
+                            continue
+                        style_name = style_name_elem.get(val_attr)
+                        if not style_name:
+                            continue
+                        style_name_lower = style_name.lower()
+                        if style_name_lower in _STYLE_NAME_TO_HEADING:
+                            style_to_level[style_id] = _STYLE_NAME_TO_HEADING[style_name_lower]
+                        elif (
+                            style_name_lower.startswith("list bullet")
+                            or style_name_lower == "bullet"
+                        ):
+                            style_to_list[style_id] = "bullet"
+                        elif (
+                            style_name_lower.startswith("list number")
+                            or style_name_lower == "number"
+                        ):
+                            style_to_list[style_id] = "number"
+            except Exception:
+                # Styles are optional: if they cannot be read, continue with
+                # whatever was mapped so far and treat the rest as plain text.
+                pass
+
+            document_file = safe_open_zip(zip_file, "word/document.xml")
+            if not document_file:
+                return None, "document.xml 不存在或无法访问"
+
+            root = ET.parse(document_file).getroot()
+            body = root.find(".//w:body", namespaces=namespaces)
+            if body is None:
+                return None, "document.xml 中未找到 w:body 元素"
+
+            # Only direct children of the body are visited, in document order.
+            for child in body.findall("./*", namespaces=namespaces):
+                if child.tag.endswith("}p"):
+                    style_elem = child.find(".//w:pStyle", namespaces=namespaces)
+                    style_id = (
+                        style_elem.get(val_attr)
+                        if style_elem is not None
+                        else None
+                    )
+
+                    heading_level = style_to_level.get(style_id, 0)
+                    list_style = style_to_list.get(style_id)
+                    para_text = extract_text_with_formatting(child, namespaces)
+
+                    if para_text:
+                        if heading_level > 0:
+                            markdown_lines.append(f"{'#' * heading_level} {para_text}")
+                        elif list_style == "bullet":
+                            markdown_lines.append(f"- {para_text}")
+                        elif list_style == "number":
+                            markdown_lines.append(f"1. {para_text}")
+                        else:
+                            markdown_lines.append(para_text)
+                        markdown_lines.append("")
+
+                elif child.tag.endswith("}tbl"):
+                    table_md = convert_table_to_markdown(child, namespaces)
+                    if table_md:
+                        markdown_lines.append(table_md)
+                        markdown_lines.append("")
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        return None, f"XML 解析失败: {str(e)}"
--- a/scripts/readers/docx/pypandoc.py
+++ b/scripts/readers/docx/pypandoc.py
@@ -0,0 +1,29 @@
+"""使用 pypandoc-binary 库解析 DOCX 文件"""
+
+from typing import Optional, Tuple
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert a DOCX file to Markdown with pypandoc.
+
+    Returns:
+        ``(markdown, None)`` on success; ``(None, message)`` when pypandoc
+        cannot be imported, ``convert_file`` raises ``OSError`` or
+        ``RuntimeError``, or the converted text is empty after stripping.
+    """
+    try:
+        import pypandoc
+    except ImportError:
+        return None, "pypandoc-binary 库未安装"
+
+    conversion_options = {
+        "source_file": file_path,
+        "to": "md",
+        "format": "docx",
+        "outputfile": None,
+        "extra_args": ["--wrap=none"],
+    }
+    try:
+        raw_markdown = pypandoc.convert_file(**conversion_options)
+    except OSError as exc:
+        return None, f"pypandoc-binary 缺少 Pandoc 可执行文件: {exc}"
+    except RuntimeError as exc:
+        return None, f"pypandoc-binary 解析失败: {exc}"
+    else:
+        markdown = raw_markdown.strip()
+        return (markdown, None) if markdown else (None, "文档为空")
--- a/scripts/readers/docx/python_docx.py
+++ b/scripts/readers/docx/python_docx.py
@@ -0,0 +1,118 @@
+"""使用 python-docx 库解析 DOCX 文件"""
+
+from typing import Any, List, Optional, Tuple
+
+from scripts.core import build_markdown_table
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert a DOCX file to Markdown with python-docx.
+
+    Walks the document body in order and renders headings, bullet and
+    numbered list items, plain paragraphs and tables.
+
+    Returns:
+        ``(markdown, None)`` on success; ``(None, message)`` when python-docx
+        cannot be imported, the document yields no text, or any exception is
+        raised while loading or converting it.
+    """
+    try:
+        from docx import Document
+    except ImportError:
+        return None, "python-docx 库未安装"
+
+    heading_by_style = {
+        "Title": 1,
+        "Heading 1": 1,
+        "Heading 2": 2,
+        "Heading 3": 3,
+        "Heading 4": 4,
+        "Heading 5": 5,
+        "Heading 6": 6,
+    }
+    exact_list_styles = {"Bullet": "bullet", "Number": "number"}
+    list_style_prefixes = (("List Bullet", "bullet"), ("List Number", "number"))
+    list_markers = {"bullet": "-", "number": "1."}
+
+    def style_name_of(para: Any) -> Optional[str]:
+        """Return the paragraph's style name, or None when it has no style."""
+        style = para.style
+        return style.name if style else None
+
+    def heading_level_of(para: Any) -> int:
+        """Return 1-6 for heading/title styles and 0 for anything else."""
+        name = style_name_of(para)
+        return heading_by_style.get(name, 0) if name else 0
+
+    def list_kind_of(para: Any) -> Optional[str]:
+        """Return "bullet", "number" or None based on the style name."""
+        name = style_name_of(para)
+        if not name:
+            return None
+        if name in exact_list_styles:
+            return exact_list_styles[name]
+        return next(
+            (kind for prefix, kind in list_style_prefixes if name.startswith(prefix)),
+            None,
+        )
+
+    def render_runs(runs: List[Any]) -> str:
+        """Join run texts, wrapping bold, italic and underlined runs."""
+        pieces = []
+        for run in runs:
+            piece = run.text
+            if piece:
+                if run.bold:
+                    piece = f"**{piece}**"
+                if run.italic:
+                    piece = f"*{piece}*"
+                if run.underline:
+                    piece = f"<u>{piece}</u>"
+                pieces.append(piece)
+        return "".join(pieces)
+
+    def render_table(table: Any) -> str:
+        """Render a python-docx table through build_markdown_table."""
+        grid = [
+            [cell.text.strip().replace("\n", " ") for cell in row.cells]
+            for row in table.rows
+        ]
+        return build_markdown_table(grid)
+
+    try:
+        doc = Document(file_path)
+
+        from docx.table import Table as DocxTable
+        from docx.text.paragraph import Paragraph
+
+        lines = []
+        in_list = False
+
+        for element in doc.element.body:
+            tag = element.tag
+
+            if tag.endswith("}tbl"):
+                table_md = render_table(DocxTable(element, doc))
+                if table_md:
+                    lines.extend((table_md, ""))
+                in_list = False
+                continue
+
+            if not tag.endswith("}p"):
+                continue
+
+            para = Paragraph(element, doc)
+            text = render_runs(para.runs)
+            if not text.strip():
+                # Empty paragraphs are skipped without ending a running list.
+                continue
+
+            level = heading_level_of(para)
+            if level > 0:
+                lines.append(f"{'#' * level} {text}")
+                in_list = False
+                continue
+
+            marker = list_markers.get(list_kind_of(para))
+            if marker is not None:
+                # Blank line before the first item of a list.
+                if not in_list and lines:
+                    lines.append("")
+                lines.append(f"{marker} {text}")
+                in_list = True
+            else:
+                # Blank line between the end of a list and the next paragraph.
+                if in_list and lines:
+                    lines.append("")
+                lines.extend((text, ""))
+                in_list = False
+
+        markdown = "\n".join(lines)
+        return (markdown, None) if markdown.strip() else (None, "文档为空")
+    except Exception as exc:
+        return None, f"python-docx 解析失败: {str(exc)}"
--- a/scripts/readers/docx/unstructured.py
+++ b/scripts/readers/docx/unstructured.py
@@ -0,0 +1,22 @@
+"""使用 unstructured 库解析 DOCX 文件"""
+
+from typing import Optional, Tuple
+
+from scripts.core import _unstructured_elements_to_markdown
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Convert a DOCX file to Markdown with the unstructured library.
+
+    Returns:
+        ``(markdown, None)`` on success; ``(None, message)`` when
+        unstructured cannot be imported, the converted text is blank, or any
+        exception is raised while partitioning or converting.
+    """
+    try:
+        from unstructured.partition.docx import partition_docx
+    except ImportError:
+        return None, "unstructured 库未安装"
+
+    try:
+        markdown = _unstructured_elements_to_markdown(
+            partition_docx(filename=file_path, infer_table_structure=True)
+        )
+        has_text = bool(markdown.strip())
+    except Exception as exc:
+        return None, f"unstructured 解析失败: {str(exc)}"
+
+    return (markdown, None) if has_text else (None, "文档为空")