refactor: 将核心代码迁移到 scripts 目录

- 创建 scripts/ 目录作为核心代码根目录
- 移动 core/, readers/, utils/ 到 scripts/ 下
- 移动 config.py, lyxy_document_reader.py 到 scripts/
- 移动 encoding_detection.py 到 scripts/utils/
- 更新 pyproject.toml 中的入口点路径和 pytest 配置
- 更新所有内部导入语句为 scripts.* 模块
- 更新 README.md 目录结构说明
- 更新 openspec/config.yaml 添加目录结构说明
- 删除无用的 main.py

此变更使项目结构更清晰，便于区分核心代码与测试、文档等支撑文件。
2026-03-08 17:41:03 +08:00
parent 750ef50a8d
commit 15b63800a8
50 changed files with 66 additions and 60 deletions
--- a/scripts/readers/docx/python_docx.py
+++ b/scripts/readers/docx/python_docx.py
@@ -0,0 +1,118 @@
+"""使用 python-docx 库解析 DOCX 文件"""
+
+from typing import Any, List, Optional, Tuple
+
+from scripts.core import build_markdown_table
+
+
+def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """使用 python-docx 库解析 DOCX 文件"""
+    try:
+        from docx import Document
+    except ImportError:
+        return None, "python-docx 库未安装"
+
+    try:
+        doc = Document(file_path)
+
+        _HEADING_LEVELS = {
+            "Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
+            "Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
+        }
+
+        def get_heading_level(para: Any) -> int:
+            if para.style and para.style.name:
+                return _HEADING_LEVELS.get(para.style.name, 0)
+            return 0
+
+        _LIST_STYLES = {
+            "Bullet": "bullet", "Number": "number",
+        }
+
+        def get_list_style(para: Any) -> Optional[str]:
+            if not para.style or not para.style.name:
+                return None
+            style_name = para.style.name
+            if style_name in _LIST_STYLES:
+                return _LIST_STYLES[style_name]
+            if style_name.startswith("List Bullet"):
+                return "bullet"
+            if style_name.startswith("List Number"):
+                return "number"
+            return None
+
+        def convert_runs_to_markdown(runs: List[Any]) -> str:
+            result = []
+            for run in runs:
+                text = run.text
+                if not text:
+                    continue
+                if run.bold:
+                    text = f"**{text}**"
+                if run.italic:
+                    text = f"*{text}*"
+                if run.underline:
+                    text = f"<u>{text}</u>"
+                result.append(text)
+            return "".join(result)
+
+        def convert_table_to_markdown(table: Any) -> str:
+            rows_data = []
+            for row in table.rows:
+                row_data = []
+                for cell in row.cells:
+                    cell_text = cell.text.strip().replace("\n", " ")
+                    row_data.append(cell_text)
+                rows_data.append(row_data)
+            return build_markdown_table(rows_data)
+
+        markdown_lines = []
+        prev_was_list = False
+
+        from docx.table import Table as DocxTable
+        from docx.text.paragraph import Paragraph
+
+        for element in doc.element.body:
+            if element.tag.endswith('}p'):
+                para = Paragraph(element, doc)
+                text = convert_runs_to_markdown(para.runs)
+                if not text.strip():
+                    continue
+
+                heading_level = get_heading_level(para)
+                if heading_level > 0:
+                    markdown_lines.append(f"{'#' * heading_level} {text}")
+                    prev_was_list = False
+                else:
+                    list_style = get_list_style(para)
+                    if list_style == "bullet":
+                        if not prev_was_list and markdown_lines:
+                            markdown_lines.append("")
+                        markdown_lines.append(f"- {text}")
+                        prev_was_list = True
+                    elif list_style == "number":
+                        if not prev_was_list and markdown_lines:
+                            markdown_lines.append("")
+                        markdown_lines.append(f"1. {text}")
+                        prev_was_list = True
+                    else:
+                        if prev_was_list and markdown_lines:
+                            markdown_lines.append("")
+                        markdown_lines.append(text)
+                        markdown_lines.append("")
+                        prev_was_list = False
+
+            elif element.tag.endswith('}tbl'):
+                table = DocxTable(element, doc)
+                table_md = convert_table_to_markdown(table)
+                if table_md:
+                    markdown_lines.append(table_md)
+                    markdown_lines.append("")
+                prev_was_list = False
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        return None, f"python-docx 解析失败: {str(e)}"