#!/usr/bin/env python3
"""DOCX parsing module offering three alternative parsing strategies.

Each public ``parse_docx_with_*`` function takes a file path and returns a
``(content, error)`` tuple: on success ``content`` is Markdown text and
``error`` is ``None``; on failure ``content`` is ``None`` and ``error`` is a
human-readable message.
"""

import xml.etree.ElementTree as ET
import zipfile
from typing import Any, List, Optional, Tuple

from common import build_markdown_table, parse_with_markitdown, safe_open_zip

# Style names (as spelled by Word's built-in styles) mapped to Markdown
# heading depth. "Title" is rendered as a level-1 heading.
_HEADING_LEVELS = {
    "Title": 1,
    "Heading 1": 1,
    "Heading 2": 2,
    "Heading 3": 3,
    "Heading 4": 4,
    "Heading 5": 5,
    "Heading 6": 6,
}

# Lower-cased variant used by the raw XML parser, which matches style names
# case-insensitively.
_HEADING_LEVELS_LOWER = {
    name.lower(): level for name, level in _HEADING_LEVELS.items()
}

# Markdown prefix for each recognised list kind. Numbered items always use
# "1." and rely on the Markdown renderer to number them.
_LIST_PREFIXES = {"bullet": "- ", "number": "1. "}


def _ensure_blank_line(lines: List[str]) -> None:
    """Append an empty line unless *lines* is empty or already ends with one."""
    if lines and lines[-1] != "":
        lines.append("")


def _append_block(
    lines: List[str],
    block: str,
    *,
    is_list_item: bool,
    prev_was_list: bool,
    trailing_blank: bool,
) -> bool:
    """Append one Markdown block to *lines* and return the new list state.

    A blank line is inserted whenever the output switches between list items
    and any other kind of block, so that a following paragraph or table is
    not absorbed into the preceding list item.

    Args:
        lines: Output buffer, mutated in place.
        block: Markdown text of the block (may span several lines).
        is_list_item: Whether *block* is a list item.
        prev_was_list: Whether the previously appended block was a list item.
        trailing_blank: Whether to append an empty line after *block*.

    Returns:
        ``is_list_item``, to be passed as ``prev_was_list`` on the next call.
    """
    if is_list_item != prev_was_list:
        _ensure_blank_line(lines)
    lines.append(block)
    if trailing_blank:
        lines.append("")
    return is_list_item


def _append_paragraph(
    lines: List[str],
    text: str,
    heading_level: int,
    list_style: Optional[str],
    prev_was_list: bool,
) -> bool:
    """Append a paragraph as a heading, list item or plain text.

    A positive *heading_level* takes precedence over *list_style*. Plain
    paragraphs are followed by a blank line; headings and list items are not.

    Returns:
        Whether the appended block was a list item.
    """
    if heading_level > 0:
        return _append_block(
            lines,
            f"{'#' * heading_level} {text}",
            is_list_item=False,
            prev_was_list=prev_was_list,
            trailing_blank=False,
        )
    prefix = _LIST_PREFIXES.get(list_style)
    if prefix is not None:
        return _append_block(
            lines,
            f"{prefix}{text}",
            is_list_item=True,
            prev_was_list=prev_was_list,
            trailing_blank=False,
        )
    return _append_block(
        lines,
        text,
        is_list_item=False,
        prev_was_list=prev_was_list,
        trailing_blank=True,
    )


def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file by delegating to ``common.parse_with_markitdown``.

    Args:
        file_path: Path of the DOCX file.

    Returns:
        Whatever ``parse_with_markitdown`` returns for *file_path*; annotated
        as a ``(content, error)`` tuple.
    """
    return parse_with_markitdown(file_path)


def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file into Markdown using the python-docx library.

    Body paragraphs and tables are walked in document order. Paragraphs are
    classified by style name into headings, bullet/numbered list items or
    plain text; tables are rendered through ``build_markdown_table``.

    Args:
        file_path: Path of the DOCX file.

    Returns:
        ``(markdown, None)`` on success, or ``(None, message)`` when
        python-docx is not installed, the document yields no content, or any
        exception is raised while parsing.
    """
    try:
        from docx import Document
        from docx.table import Table as DocxTable
        from docx.text.paragraph import Paragraph
    except ImportError:
        return None, "python-docx 库未安装"

    try:
        doc = Document(file_path)

        def get_heading_level(para: Any) -> int:
            """Return the heading depth for the paragraph's style, or 0."""
            if para.style and para.style.name:
                return _HEADING_LEVELS.get(para.style.name, 0)
            return 0

        def get_list_style(para: Any) -> Optional[str]:
            """Return ``"bullet"``, ``"number"`` or ``None`` from the style name."""
            if not para.style or not para.style.name:
                return None
            style_name = para.style.name
            if style_name.startswith("List Bullet") or style_name == "Bullet":
                return "bullet"
            if style_name.startswith("List Number") or style_name == "Number":
                return "number"
            return None

        def convert_runs_to_markdown(runs: List[Any]) -> str:
            """Concatenate run texts, wrapping bold and italic runs."""
            result = []
            for run in runs:
                text = run.text
                if not text:
                    continue
                if run.bold:
                    text = f"**{text}**"
                if run.italic:
                    text = f"*{text}*"
                # Underline has no Markdown equivalent; underlined text is
                # emitted unchanged.
                result.append(text)
            return "".join(result)

        def convert_table_to_markdown(table: Any) -> str:
            """Render a python-docx table via ``build_markdown_table``."""
            rows_data = [
                [cell.text.strip().replace("\n", " ") for cell in row.cells]
                for row in table.rows
            ]
            return build_markdown_table(rows_data)

        markdown_lines: List[str] = []
        prev_was_list = False

        for element in doc.element.body:
            tag = element.tag
            if tag.endswith("}p"):
                para = Paragraph(element, doc)
                text = convert_runs_to_markdown(para.runs)
                if not text.strip():
                    continue
                prev_was_list = _append_paragraph(
                    markdown_lines,
                    text,
                    get_heading_level(para),
                    get_list_style(para),
                    prev_was_list,
                )
            elif tag.endswith("}tbl"):
                table_md = convert_table_to_markdown(DocxTable(element, doc))
                if table_md:
                    prev_was_list = _append_block(
                        markdown_lines,
                        table_md,
                        is_list_item=False,
                        prev_was_list=prev_was_list,
                        trailing_blank=True,
                    )

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"python-docx 解析失败: {str(e)}"


def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file into Markdown by reading its XML parts directly.

    ``word/styles.xml`` is read (best effort) to map style ids to heading
    levels and list kinds; ``word/document.xml`` is then walked in document
    order, converting direct children of ``w:body`` that are paragraphs or
    tables.

    Args:
        file_path: Path of the DOCX file.

    Returns:
        ``(markdown, None)`` on success, or ``(None, message)`` when
        ``document.xml`` or its ``w:body`` is missing, the document yields no
        content, or any exception is raised while parsing.
    """
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}

    val_attr = f"{{{word_namespace}}}val"
    text_tag = f"{{{word_namespace}}}t"
    # Tabs and line breaks inside a run are rendered as a single space so the
    # surrounding words do not fuse and no raw newline enters a heading,
    # list item or table cell.
    spacing_tags = {
        f"{{{word_namespace}}}tab",
        f"{{{word_namespace}}}br",
        f"{{{word_namespace}}}cr",
    }

    def is_enabled(run: Any, prop: str) -> bool:
        """Return whether toggle property *prop* (``b``/``i``) is on for *run*.

        The element may carry ``w:val``; values ``0``/``false``/``off``
        switch the property off, while a missing attribute means "on".
        """
        elem = run.find(f"w:rPr/w:{prop}", namespaces=namespaces)
        if elem is None:
            return False
        val = elem.get(val_attr)
        return val is None or val.lower() not in ("0", "false", "off")

    def extract_run_text(run: Any) -> str:
        """Collect the text of every ``w:t`` child of *run*, in order."""
        parts = []
        for node in run:
            if node.tag == text_tag:
                parts.append(node.text or "")
            elif node.tag in spacing_tags:
                parts.append(" ")
        return "".join(parts)

    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
        """Return the text of all runs under *para* with bold/italic markers."""
        texts = []
        for run in para.findall(".//w:r", namespaces=namespaces):
            text = extract_run_text(run)
            if not text:
                continue
            # Emphasis markers around pure whitespace are not valid Markdown.
            if text.strip():
                if is_enabled(run, "b"):
                    text = f"**{text}**"
                if is_enabled(run, "i"):
                    text = f"*{text}*"
            texts.append(text)
        return "".join(texts).strip()

    def extract_cell_text(cell: Any, namespaces: dict) -> str:
        """Return cell text on one line, with paragraphs joined by a space."""
        paragraphs = cell.findall(".//w:p", namespaces=namespaces)
        pieces = [extract_text_with_formatting(p, namespaces) for p in paragraphs]
        cell_text = " ".join(piece for piece in pieces if piece)
        cell_text = cell_text.replace("\n", " ").strip()
        # A literal pipe would otherwise be read as a column separator.
        return cell_text.replace("|", "\\|")

    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
        """Render a ``w:tbl`` element as a Markdown pipe table.

        The first row that has cells is treated as the header and followed
        by the separator row.
        """
        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
        if not rows:
            return ""
        md_lines = []
        header_written = False
        for row in rows:
            cells = row.findall(".//w:tc", namespaces=namespaces)
            cell_texts = [extract_cell_text(cell, namespaces) for cell in cells]
            if not cell_texts:
                continue
            md_lines.append("| " + " | ".join(cell_texts) + " |")
            if not header_written:
                md_lines.append("| " + " | ".join(["---"] * len(cell_texts)) + " |")
                header_written = True
        return "\n".join(md_lines)

    try:
        style_to_level: dict = {}
        style_to_list: dict = {}
        markdown_lines: List[str] = []
        prev_was_list = False

        with zipfile.ZipFile(file_path) as zip_file:
            try:
                styles_file = safe_open_zip(zip_file, "word/styles.xml")
                if styles_file:
                    styles_root = ET.parse(styles_file).getroot()
                    for style in styles_root.findall(
                        ".//w:style", namespaces=namespaces
                    ):
                        style_id = style.get(f"{{{word_namespace}}}styleId")
                        name_elem = style.find("w:name", namespaces=namespaces)
                        if not style_id or name_elem is None:
                            continue
                        style_name = name_elem.get(val_attr)
                        if not style_name:
                            continue
                        name_lower = style_name.lower()
                        level = _HEADING_LEVELS_LOWER.get(name_lower)
                        if level is not None:
                            style_to_level[style_id] = level
                        elif (
                            name_lower.startswith("list bullet")
                            or name_lower == "bullet"
                        ):
                            style_to_list[style_id] = "bullet"
                        elif (
                            name_lower.startswith("list number")
                            or name_lower == "number"
                        ):
                            style_to_list[style_id] = "number"
            except Exception:
                # Deliberate best effort: if styles.xml is missing or
                # malformed, continue with whatever mappings were collected;
                # unmapped paragraphs are emitted as plain text.
                pass

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"

            root = ET.parse(document_file).getroot()
            body = root.find(".//w:body", namespaces=namespaces)
            if body is None:
                return None, "document.xml 中未找到 w:body 元素"

            for child in body:
                if child.tag.endswith("}p"):
                    style_elem = child.find(".//w:pStyle", namespaces=namespaces)
                    style_id = (
                        style_elem.get(val_attr) if style_elem is not None else None
                    )
                    para_text = extract_text_with_formatting(child, namespaces)
                    if not para_text:
                        continue
                    prev_was_list = _append_paragraph(
                        markdown_lines,
                        para_text,
                        style_to_level.get(style_id, 0),
                        style_to_list.get(style_id),
                        prev_was_list,
                    )
                elif child.tag.endswith("}tbl"):
                    table_md = convert_table_to_markdown(child, namespaces)
                    if table_md:
                        prev_was_list = _append_block(
                            markdown_lines,
                            table_md,
                            is_list_item=False,
                            prev_was_list=prev_was_list,
                            trailing_blank=True,
                        )

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"