# lyxy-document/scripts/readers/docx/native_xml.py

"""使用 XML 原生解析 DOCX 文件"""

import xml.etree.ElementTree as ET
import zipfile
from typing import Any, Dict, List, Optional, Tuple

from readers._utils import build_markdown_table, safe_open_zip


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown by reading its XML parts directly.

    ``word/styles.xml`` is read on a best-effort basis to learn which style
    ids are headings or bullet/numbered lists. The direct children of
    ``w:body`` in ``word/document.xml`` are then walked in order: every
    non-empty paragraph becomes one Markdown line (heading, list item or
    plain text) and every ``w:tbl`` becomes one Markdown table.

    Args:
        file_path: Path to the ``.docx`` archive.

    Returns:
        ``(markdown, None)`` on success, or ``(None, message)`` when the
        document part is missing, has no ``w:body``, produces no text, or
        an exception is raised while reading the archive.
    """
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}
    val_attr = f"{{{word_namespace}}}val"
    paragraph_tag = f"{{{word_namespace}}}p"
    table_tag = f"{{{word_namespace}}}tbl"

    _STYLE_NAME_TO_HEADING = {
        "title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
        "heading 4": 4, "heading 5": 5, "heading 6": 6,
    }
    # w:val literals that explicitly switch an on/off run property off.
    _OFF_VALUES = ("0", "false", "off")

    def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
        """Return the heading level mapped to *style_id*, or 0 if none."""
        return style_to_level.get(style_id, 0)

    def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]:
        """Return ``"bullet"``/``"number"`` for *style_id*, or None."""
        return style_to_list.get(style_id, None)

    def is_toggle_on(run: Any, prop_name: str, namespaces: dict) -> bool:
        """Tell whether the run's own ``w:rPr`` enables ``w:<prop_name>``.

        Only the direct ``w:rPr`` child is consulted, so formatting recorded
        under ``w:rPrChange`` is not mistaken for the current formatting. An
        element without ``w:val`` counts as on; ``0``/``false``/``off``
        count as off.
        """
        prop = run.find(f"w:rPr/w:{prop_name}", namespaces=namespaces)
        if prop is None:
            return False
        return prop.get(val_attr, "true").strip().lower() not in _OFF_VALUES

    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
        """Concatenate the text of every run under *para*.

        Runs whose own properties enable bold and/or italic are wrapped in
        Markdown emphasis markers. The result is stripped of surrounding
        whitespace.
        """
        texts = []
        for run in para.findall(".//w:r", namespaces=namespaces):
            # A run may hold several w:t children (e.g. split by w:tab or
            # w:br); keep all of them instead of only the first one.
            text = "".join(
                text_elem.text or ""
                for text_elem in run.findall("w:t", namespaces=namespaces)
            )
            if not text:
                continue
            if is_toggle_on(run, "b", namespaces):
                text = f"**{text}**"
            if is_toggle_on(run, "i", namespaces):
                text = f"*{text}*"
            texts.append(text)
        return "".join(texts).strip()

    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
        """Render *table_elem* as a Markdown table via build_markdown_table.

        Returns an empty string when the table has no rows.
        """
        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
        if not rows:
            return ""
        rows_data = []
        for row in rows:
            cell_texts = []
            for cell in row.findall(".//w:tc", namespaces=namespaces):
                cell_text = extract_text_with_formatting(cell, namespaces)
                # Newlines would break the single-line Markdown row.
                cell_texts.append(cell_text.replace("\n", " ").strip())
            if cell_texts:
                rows_data.append(cell_texts)
        return build_markdown_table(rows_data)

    try:
        style_to_level = {}
        style_to_list = {}
        markdown_lines = []

        with zipfile.ZipFile(file_path) as zip_file:
            # Style information only refines the output (headings, lists),
            # so any failure here is deliberately ignored and the document
            # is still converted as plain paragraphs.
            try:
                styles_file = safe_open_zip(zip_file, "word/styles.xml")
                if styles_file:
                    styles_root = ET.parse(styles_file).getroot()
                    for style in styles_root.findall(
                        ".//w:style", namespaces=namespaces
                    ):
                        style_id = style.get(f"{{{word_namespace}}}styleId")
                        style_name_elem = style.find("w:name", namespaces=namespaces)
                        if not style_id or style_name_elem is None:
                            continue
                        style_name = style_name_elem.get(val_attr)
                        if not style_name:
                            continue
                        style_name_lower = style_name.lower()
                        if style_name_lower in _STYLE_NAME_TO_HEADING:
                            style_to_level[style_id] = _STYLE_NAME_TO_HEADING[
                                style_name_lower
                            ]
                        elif (
                            style_name_lower.startswith("list bullet")
                            or style_name_lower == "bullet"
                        ):
                            style_to_list[style_id] = "bullet"
                        elif (
                            style_name_lower.startswith("list number")
                            or style_name_lower == "number"
                        ):
                            style_to_list[style_id] = "number"
            except Exception:
                pass

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"

            root = ET.parse(document_file).getroot()
            body = root.find(".//w:body", namespaces=namespaces)
            if body is None:
                return None, "document.xml 中未找到 w:body 元素"

            for child in body:
                if child.tag == paragraph_tag:
                    # Read the paragraph's own style only; a descendant
                    # search could pick up the style of a nested paragraph
                    # or of a tracked formatting change.
                    style_elem = child.find("w:pPr/w:pStyle", namespaces=namespaces)
                    style_id = (
                        style_elem.get(val_attr) if style_elem is not None else None
                    )

                    heading_level = get_heading_level(style_id, style_to_level)
                    list_style = get_list_style(style_id, style_to_list)
                    para_text = extract_text_with_formatting(child, namespaces)

                    if not para_text:
                        continue
                    if heading_level > 0:
                        markdown_lines.append(f"{'#' * heading_level} {para_text}")
                    elif list_style == "bullet":
                        markdown_lines.append(f"- {para_text}")
                    elif list_style == "number":
                        markdown_lines.append(f"1. {para_text}")
                    else:
                        markdown_lines.append(para_text)
                    # Blank line so consecutive blocks stay separate.
                    markdown_lines.append("")

                elif child.tag == table_tag:
                    table_md = convert_table_to_markdown(child, namespaces)
                    if table_md:
                        markdown_lines.append(table_md)
                        markdown_lines.append("")

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"