"""Parse DOCX files natively from their underlying WordprocessingML XML."""

import xml.etree.ElementTree as ET
import zipfile
from typing import Any, Dict, List, Optional, Tuple

from scripts.readers._utils import build_markdown_table, safe_open_zip

_WORD_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_NAMESPACES = {"w": _WORD_NAMESPACE}
# Clark-notation prefix ("{namespace}") used for tag and attribute comparisons.
_W = f"{{{_WORD_NAMESPACE}}}"

_TAG_P = f"{_W}p"
_TAG_R = f"{_W}r"
_TAG_T = f"{_W}t"
_TAG_TBL = f"{_W}tbl"
_TAG_TR = f"{_W}tr"
_TAG_TC = f"{_W}tc"
_ATTR_VAL = f"{_W}val"
# Run children that represent whitespace rather than literal text.
_WHITESPACE_TAGS = frozenset({f"{_W}tab", f"{_W}br", f"{_W}cr"})
# Values of w:val that switch a toggle property (w:b, w:i) off.
_TOGGLE_OFF_VALUES = frozenset({"0", "false", "off"})

# Lower-cased style display name -> Markdown heading level.
_STYLE_NAME_TO_HEADING = {
    "title": 1,
    "heading 1": 1,
    "heading 2": 2,
    "heading 3": 3,
    "heading 4": 4,
    "heading 5": 5,
    "heading 6": 6,
}


def _is_toggle_enabled(run: Any, prop_name: str) -> bool:
    """Return True when the run's own ``w:rPr`` enables ``w:<prop_name>``.

    A toggle element without ``w:val`` means "on"; ``0``/``false``/``off``
    explicitly disable it, so mere presence of the element is not enough.
    """
    prop = run.find(f"w:rPr/w:{prop_name}", namespaces=_NAMESPACES)
    if prop is None:
        return False
    return prop.get(_ATTR_VAL, "true") not in _TOGGLE_OFF_VALUES


def _run_text(run: Any) -> str:
    """Return the text carried by the direct children of a ``w:r`` element.

    All ``w:t`` children are concatenated (a run may hold several), and tabs
    and breaks become a single space so neighbouring words do not fuse.
    Only direct children are read: runs nested deeper (e.g. in a text box)
    are visited on their own by the caller and must not be counted twice.
    """
    parts: List[str] = []
    for node in run:
        if node.tag == _TAG_T:
            parts.append(node.text or "")
        elif node.tag in _WHITESPACE_TAGS:
            parts.append(" ")
    return "".join(parts)


def _format_run(run: Any) -> str:
    """Return the run's text wrapped in Markdown bold/italic markers."""
    text = _run_text(run)
    core = text.strip()
    if not core:
        # Nothing visible to emphasise; keep whitespace-only runs as spacing.
        return text
    marker = ("**" if _is_toggle_enabled(run, "b") else "") + (
        "*" if _is_toggle_enabled(run, "i") else ""
    )
    if not marker:
        return text
    # Markdown ignores emphasis whose markers touch whitespace on the inside,
    # so edge whitespace is moved outside the markers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"


def _extract_text_with_formatting(element: Any) -> str:
    """Return the Markdown text of every run below *element*.

    *element* is a paragraph or a table cell. Nested paragraphs (several
    paragraphs in one cell, text boxes) are separated by a single space.
    """
    pieces: List[str] = []
    for node in element.iter():
        if node.tag == _TAG_P:
            if node is not element and pieces and pieces[-1] != " ":
                pieces.append(" ")
        elif node.tag == _TAG_R:
            formatted = _format_run(node)
            if formatted:
                pieces.append(formatted)
    return "".join(pieces).strip()


def _iter_own(container: Any, wanted_tag: str) -> Any:
    """Yield *wanted_tag* descendants without descending into a match.

    Wrapper elements are searched through, but a matched element is never
    entered, so rows/cells of nested tables are not reported as belonging
    to the outer table.
    """
    for child in container:
        if child.tag == wanted_tag:
            yield child
        elif child.tag != _TAG_TBL:
            yield from _iter_own(child, wanted_tag)


def _convert_table_to_markdown(table_elem: Any) -> str:
    """Render a ``w:tbl`` element as a Markdown table ("" when it has no rows)."""
    rows = list(_iter_own(table_elem, _TAG_TR))
    if not rows:
        return ""
    rows_data: List[List[str]] = []
    for row in rows:
        cell_texts = [
            _extract_text_with_formatting(cell).replace("\n", " ").strip()
            for cell in _iter_own(row, _TAG_TC)
        ]
        if cell_texts:
            rows_data.append(cell_texts)
    return build_markdown_table(rows_data)


def _load_style_maps(zip_file: Any) -> Tuple[Dict[str, int], Dict[str, str]]:
    """Read ``word/styles.xml`` and map style ids to heading levels / list kinds.

    Returns ``(style_to_level, style_to_list)``. Style information is optional:
    any failure leaves the maps as filled so far and the document is then
    rendered with whatever styles were recognised.
    """
    style_to_level: Dict[str, int] = {}
    style_to_list: Dict[str, str] = {}
    try:
        styles_file = safe_open_zip(zip_file, "word/styles.xml")
        if styles_file:
            styles_root = ET.parse(styles_file).getroot()
            for style in styles_root.findall(".//w:style", namespaces=_NAMESPACES):
                style_id = style.get(f"{_W}styleId")
                name_elem = style.find("w:name", namespaces=_NAMESPACES)
                if not style_id or name_elem is None:
                    continue
                style_name = name_elem.get(_ATTR_VAL)
                if not style_name:
                    continue
                name_lower = style_name.lower()
                if name_lower in _STYLE_NAME_TO_HEADING:
                    style_to_level[style_id] = _STYLE_NAME_TO_HEADING[name_lower]
                elif name_lower.startswith("list bullet") or name_lower == "bullet":
                    style_to_list[style_id] = "bullet"
                elif name_lower.startswith("list number") or name_lower == "number":
                    style_to_list[style_id] = "number"
    except Exception:
        # Deliberate best effort: a missing or broken styles part must not
        # prevent the document text itself from being extracted.
        pass
    return style_to_level, style_to_list


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown by reading its XML parts directly.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        ``(content, None)`` on success, or ``(None, error_message)`` when the
        archive cannot be read, ``word/document.xml`` is missing, the body
        element is absent, or the document yields no text.
    """
    try:
        with zipfile.ZipFile(file_path) as zip_file:
            style_to_level, style_to_list = _load_style_maps(zip_file)

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"
            root = ET.parse(document_file).getroot()

        body = root.find(".//w:body", namespaces=_NAMESPACES)
        if body is None:
            return None, "document.xml 中未找到 w:body 元素"

        markdown_lines: List[str] = []
        for child in body:
            if child.tag == _TAG_P:
                # Only the paragraph's own properties count; a descendant
                # search could pick up the style of a nested paragraph.
                style_elem = child.find("w:pPr/w:pStyle", namespaces=_NAMESPACES)
                style_id = style_elem.get(_ATTR_VAL) if style_elem is not None else None
                para_text = _extract_text_with_formatting(child)
                if not para_text:
                    continue
                heading_level = style_to_level.get(style_id, 0)
                list_style = style_to_list.get(style_id)
                if heading_level > 0:
                    markdown_lines.append(f"{'#' * heading_level} {para_text}")
                elif list_style == "bullet":
                    markdown_lines.append(f"- {para_text}")
                elif list_style == "number":
                    markdown_lines.append(f"1. {para_text}")
                else:
                    markdown_lines.append(para_text)
                markdown_lines.append("")
            elif child.tag == _TAG_TBL:
                table_md = _convert_table_to_markdown(child)
                if table_md:
                    markdown_lines.append(table_md)
                    markdown_lines.append("")

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"