lyxy-document/scripts/readers/docx/python_docx.py

"""使用 python-docx 库解析 DOCX 文件"""

from typing import Any, List, Optional, Tuple

from scripts.readers._utils import build_markdown_table


# Paragraph style names that map to a Markdown heading level.
_HEADING_LEVELS = {
    "Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
    "Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
}

# Exact paragraph style names that map to a list kind.
_LIST_STYLES = {
    "Bullet": "bullet", "Number": "number",
}


def _get_heading_level(para: Any) -> int:
    """Return the Markdown heading level for *para*, or 0 if it is not a heading."""
    if para.style and para.style.name:
        return _HEADING_LEVELS.get(para.style.name, 0)
    return 0


def _get_list_style(para: Any) -> Optional[str]:
    """Return ``"bullet"``, ``"number"`` or ``None`` based on the paragraph style name."""
    if not para.style or not para.style.name:
        return None
    style_name = para.style.name
    if style_name in _LIST_STYLES:
        return _LIST_STYLES[style_name]
    # Covers the "List Bullet", "List Bullet 2", ... style families.
    if style_name.startswith("List Bullet"):
        return "bullet"
    if style_name.startswith("List Number"):
        return "number"
    return None


def _collect_runs(para: Any) -> List[Any]:
    """Return the runs of *para* in document order, including runs inside hyperlinks.

    ``para.runs`` only covers runs that are direct children of the paragraph,
    so link text would otherwise be dropped from the output.
    """
    iter_inner = getattr(para, "iter_inner_content", None)
    if iter_inner is None:
        # Older python-docx releases do not expose hyperlinks; keep the
        # previous behaviour and use the direct runs only.
        return list(para.runs)

    runs: List[Any] = []
    for item in iter_inner():
        # Hyperlink objects expose their own ``runs``; plain Run objects do not.
        nested = getattr(item, "runs", None)
        if nested is None:
            runs.append(item)
        else:
            runs.extend(nested)
    return runs


def _convert_runs_to_markdown(runs: List[Any]) -> str:
    """Join *runs* into one string, marking bold, italic and underlined text.

    Emphasis markers are placed tightly around the non-blank core of each run:
    ``**word **`` is not recognised as emphasis by Markdown parsers, so the
    surrounding whitespace is emitted outside the markers. Runs consisting
    solely of whitespace are emitted unformatted.
    """
    parts = []
    for run in runs:
        text = run.text
        if not text:
            continue

        core = text.strip()
        if not core:
            # Whitespace-only run: keep the spacing, skip the markers.
            parts.append(text)
            continue

        lead_len = len(text) - len(text.lstrip())
        leading = text[:lead_len]
        trailing = text[lead_len + len(core):]

        if run.bold:
            core = f"**{core}**"
        if run.italic:
            core = f"*{core}*"
        if run.underline:
            core = f"<u>{core}</u>"
        parts.append(f"{leading}{core}{trailing}")
    return "".join(parts)


def _convert_table_to_markdown(table: Any) -> str:
    """Render a python-docx table through ``build_markdown_table``."""
    rows_data = [
        # Cell text is flattened to a single line so it fits in one table row.
        [cell.text.strip().replace("\n", " ") for cell in row.cells]
        for row in table.rows
    ]
    return build_markdown_table(rows_data)


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file into Markdown using the python-docx library.

    Body-level paragraphs and tables are converted in document order.
    Headings, bullet/numbered lists, bold/italic/underline runs and tables
    are mapped to their Markdown equivalents.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        ``(content, None)`` on success, or ``(None, error_message)`` when
        python-docx is not installed, the document yields no content, or
        parsing fails. No exception from parsing is propagated to the caller.
    """
    try:
        from docx import Document
    except ImportError:
        return None, "python-docx 库未安装"

    try:
        from docx.table import Table as DocxTable
        from docx.text.paragraph import Paragraph

        doc = Document(file_path)

        markdown_lines: List[str] = []
        prev_was_list = False

        # Walk the raw body children so paragraphs and tables keep their
        # relative order.
        for element in doc.element.body:
            tag = element.tag
            if tag.endswith('}p'):
                para = Paragraph(element, doc)
                text = _convert_runs_to_markdown(_collect_runs(para))
                if not text.strip():
                    continue

                heading_level = _get_heading_level(para)
                if heading_level > 0:
                    if prev_was_list:
                        # Close the preceding list before the heading.
                        markdown_lines.append("")
                    markdown_lines.append(f"{'#' * heading_level} {text}")
                    prev_was_list = False
                    continue

                list_style = _get_list_style(para)
                if list_style in ("bullet", "number"):
                    if not prev_was_list and markdown_lines:
                        markdown_lines.append("")
                    marker = "-" if list_style == "bullet" else "1."
                    markdown_lines.append(f"{marker} {text}")
                    prev_was_list = True
                else:
                    if prev_was_list and markdown_lines:
                        markdown_lines.append("")
                    markdown_lines.append(text)
                    markdown_lines.append("")
                    prev_was_list = False

            elif tag.endswith('}tbl'):
                table_md = _convert_table_to_markdown(DocxTable(element, doc))
                if table_md:
                    if prev_was_list:
                        # Without a blank line the table header row can be
                        # absorbed into the last list item.
                        markdown_lines.append("")
                    markdown_lines.append(table_md)
                    markdown_lines.append("")
                prev_was_list = False

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        # Boundary of the reader: every failure is reported as an error tuple.
        return None, f"python-docx 解析失败: {str(e)}"