"""Parse DOCX files into Markdown using the python-docx library."""

from itertools import groupby
from typing import Any, Iterable, Iterator, List, Optional, Tuple

from core import build_markdown_table

# Paragraph style name -> Markdown heading depth.
_HEADING_LEVELS = {
    "Title": 1,
    "Heading 1": 1,
    "Heading 2": 2,
    "Heading 3": 3,
    "Heading 4": 4,
    "Heading 5": 5,
    "Heading 6": 6,
}

# Exact paragraph style names that denote a list item.
_LIST_STYLES = {
    "Bullet": "bullet",
    "Number": "number",
}

# Style-name prefixes that denote a list item (e.g. "List Bullet 2").
_LIST_STYLE_PREFIXES = (
    ("List Bullet", "bullet"),
    ("List Number", "number"),
)

# Markdown line prefix for each list kind.
_LIST_MARKERS = {
    "bullet": "-",
    "number": "1.",
}


def _style_name(para: Any) -> Optional[str]:
    """Return the paragraph's style name, or None when it has none."""
    style = para.style
    if style and style.name:
        return style.name
    return None


def _heading_level(para: Any) -> int:
    """Return the Markdown heading depth for *para*, or 0 if not a heading."""
    name = _style_name(para)
    if name is None:
        return 0
    return _HEADING_LEVELS.get(name, 0)


def _list_style(para: Any) -> Optional[str]:
    """Return "bullet" or "number" for list paragraphs, otherwise None."""
    name = _style_name(para)
    if name is None:
        return None
    if name in _LIST_STYLES:
        return _LIST_STYLES[name]
    for prefix, kind in _LIST_STYLE_PREFIXES:
        if name.startswith(prefix):
            return kind
    return None


def _iter_runs(para: Any) -> Iterator[Any]:
    """Yield the paragraph's runs, including runs nested in hyperlinks.

    ``para.runs`` only covers runs that are direct children of the
    paragraph, so hyperlink text would otherwise be lost. Newer python-docx
    versions expose ``iter_inner_content()``; when it is missing we fall
    back to ``para.runs``.
    """
    iter_inner = getattr(para, "iter_inner_content", None)
    if iter_inner is None:
        yield from para.runs
        return
    for item in iter_inner():
        # Hyperlink items carry their own ``runs``; plain runs do not.
        nested = getattr(item, "runs", None)
        if nested is None:
            yield item
        else:
            yield from nested


def _wrap_emphasis(text: str, bold: bool, italic: bool) -> str:
    """Wrap *text* in Markdown emphasis markers.

    Surrounding whitespace is kept outside the markers because Markdown
    does not recognise ``**bold **`` as emphasis. Whitespace-only text is
    returned unchanged.
    """
    core_text = text.strip()
    if not core_text or not (bold or italic):
        return text
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    marker = ("**" if bold else "") + ("*" if italic else "")
    return f"{leading}{marker}{core_text}{marker}{trailing}"


def _runs_to_markdown(runs: Iterable[Any]) -> str:
    """Convert runs to Markdown text with bold/italic emphasis.

    Consecutive runs sharing the same bold/italic state are merged before
    wrapping, so text that Word split into several runs does not produce
    back-to-back markers such as ``**Hel****lo**``. Underline has no
    Markdown equivalent, so underlined text is emitted unchanged.
    """
    non_empty = (run for run in runs if run.text)
    pieces = []
    for (bold, italic), group in groupby(
        non_empty, key=lambda run: (bool(run.bold), bool(run.italic))
    ):
        merged = "".join(run.text for run in group)
        pieces.append(_wrap_emphasis(merged, bold, italic))
    return "".join(pieces)


def _table_to_markdown(table: Any) -> str:
    """Render a table via ``build_markdown_table``.

    Each cell's text is stripped and its newlines replaced with spaces so
    that a cell stays on one line.
    """
    rows_data = [
        [cell.text.strip().replace("\n", " ") for cell in row.cells]
        for row in table.rows
    ]
    return build_markdown_table(rows_data)


def _ensure_blank_line(lines: List[str]) -> None:
    """Append a blank separator unless *lines* is empty or ends with one."""
    if lines and lines[-1] != "":
        lines.append("")


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file into Markdown using the python-docx library.

    Body elements are processed in document order: paragraphs become
    headings, list items, or plain paragraphs depending on their style
    name, and tables are rendered as Markdown tables.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        ``(markdown, None)`` on success, or ``(None, error_message)`` when
        python-docx is not installed, the document yields no content, or
        parsing raises an exception.
    """
    try:
        from docx import Document
    except ImportError:
        return None, "python-docx 库未安装"

    # Boundary handler: any parsing failure is reported through the error
    # slot of the returned tuple rather than raised to the caller.
    try:
        from docx.table import Table as DocxTable
        from docx.text.paragraph import Paragraph

        doc = Document(file_path)
        markdown_lines: List[str] = []
        prev_was_list = False

        for element in doc.element.body:
            tag = element.tag
            if tag.endswith("}p"):
                para = Paragraph(element, doc)
                text = _runs_to_markdown(_iter_runs(para))
                if not text.strip():
                    continue

                heading_level = _heading_level(para)
                if heading_level > 0:
                    if prev_was_list:
                        _ensure_blank_line(markdown_lines)
                    markdown_lines.append(f"{'#' * heading_level} {text}")
                    prev_was_list = False
                    continue

                marker = _LIST_MARKERS.get(_list_style(para) or "")
                if marker is not None:
                    # Separate the first item of a list from what precedes.
                    if not prev_was_list:
                        _ensure_blank_line(markdown_lines)
                    markdown_lines.append(f"{marker} {text}")
                    prev_was_list = True
                else:
                    if prev_was_list:
                        _ensure_blank_line(markdown_lines)
                    markdown_lines.append(text)
                    markdown_lines.append("")
                    prev_was_list = False
            elif tag.endswith("}tbl"):
                table_md = _table_to_markdown(DocxTable(element, doc))
                if table_md:
                    # Without a separator the table row could be absorbed
                    # into the preceding list item.
                    if prev_was_list:
                        _ensure_blank_line(markdown_lines)
                    markdown_lines.append(table_md)
                    markdown_lines.append("")
                    prev_was_list = False

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"python-docx 解析失败: {e}"