完成多文档读取的脚本

2026-02-14 21:11:37 +08:00
parent 4c9effac0b
commit 8c27b08fdc
7 changed files with 1410 additions and 1288 deletions
--- a/temp/scripts/docx.py
+++ b/temp/scripts/docx.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""DOCX 文件解析模块，提供三种解析方法。"""
+
+import xml.etree.ElementTree as ET
+import zipfile
+from typing import Any, List, Optional, Tuple
+
+from common import build_markdown_table, safe_open_zip
+
+
+def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file with the MarkItDown library.
+
+    Args:
+        file_path: Path of the DOCX file handed to ``MarkItDown.convert``.
+
+    Returns:
+        ``(content, None)`` on success, or ``(None, error_message)`` when the
+        library cannot be imported, the conversion raises, or the converted
+        text is empty.
+    """
+    # Keep the import in its own try block (as parse_docx_with_python_docx
+    # does) so that an ImportError raised while converting is reported as a
+    # parse failure instead of "library not installed".
+    try:
+        from markitdown import MarkItDown
+    except ImportError:
+        return None, "MarkItDown 库未安装"
+
+    try:
+        result = MarkItDown().convert(file_path)
+        # A missing (None) text_content is treated like an empty document.
+        text = result.text_content or ""
+        if not text.strip():
+            return None, "文档为空"
+        return text, None
+    except Exception as e:
+        return None, f"MarkItDown 解析失败: {str(e)}"
+
+
+def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file into Markdown with the python-docx library.
+
+    Headings, bullet/numbered list paragraphs, bold/italic/underline runs and
+    tables are converted. Paragraphs and tables are emitted in the order in
+    which they appear as direct children of the document body.
+
+    Args:
+        file_path: Path of the DOCX file passed to ``docx.Document``.
+
+    Returns:
+        ``(markdown, None)`` on success, or ``(None, error_message)`` when the
+        library cannot be imported, parsing raises, or no content is produced.
+    """
+    # NOTE(review): this file is itself named docx.py. If its directory comes
+    # before site-packages on sys.path, the imports below may resolve to this
+    # module instead of python-docx -- confirm how the script is launched.
+    try:
+        from docx import Document
+        from docx.table import Table
+        from docx.text.paragraph import Paragraph
+    except ImportError:
+        return None, "python-docx 库未安装"
+
+    def get_heading_level(para: Any) -> int:
+        """Return the Markdown heading level (1-6) for *para*, or 0 if none."""
+        if not para.style or not para.style.name:
+            return 0
+        style_name = para.style.name
+        if "Title" in style_name:
+            return 1
+        # Checked in ascending order, so the lowest matching level wins.
+        for level in range(1, 7):
+            if f"Heading {level}" in style_name:
+                return level
+        return 0
+
+    def get_list_style(para: Any) -> Optional[str]:
+        """Return "bullet" or "number" for list-styled paragraphs, else None."""
+        if not para.style or not para.style.name:
+            return None
+        style_name = para.style.name
+        if style_name.startswith("List Bullet") or style_name == "Bullet":
+            return "bullet"
+        if style_name.startswith("List Number") or style_name == "Number":
+            return "number"
+        return None
+
+    def convert_runs_to_markdown(runs: List[Any]) -> str:
+        """Join run texts, wrapping bold/italic/underline runs in markup."""
+        pieces = []
+        for run in runs:
+            text = run.text
+            if not text:
+                continue
+            if run.bold:
+                text = f"**{text}**"
+            if run.italic:
+                text = f"*{text}*"
+            if run.underline:
+                text = f"<u>{text}</u>"
+            pieces.append(text)
+        return "".join(pieces)
+
+    def convert_table_to_markdown(table: Any) -> str:
+        """Flatten each cell to one line and render via build_markdown_table."""
+        rows_data = [
+            [cell.text.strip().replace("\n", " ") for cell in row.cells]
+            for row in table.rows
+        ]
+        return build_markdown_table(rows_data)
+
+    try:
+        doc = Document(file_path)
+
+        markdown_lines: List[str] = []
+        prev_was_list = False
+
+        # Walk the body's direct children so tables stay where they appear in
+        # the document instead of all being appended after the paragraphs.
+        for child in doc.element.body.iterchildren():
+            tag = child.tag
+            if not isinstance(tag, str):
+                # XML comments / processing instructions have a non-str tag.
+                continue
+
+            if tag.endswith("}tbl"):
+                # Separate the table from directly preceding text.
+                if markdown_lines and markdown_lines[-1]:
+                    markdown_lines.append("")
+                markdown_lines.append(convert_table_to_markdown(Table(child, doc)))
+                markdown_lines.append("")
+                prev_was_list = False
+                continue
+
+            if not tag.endswith("}p"):
+                continue
+
+            para = Paragraph(child, doc)
+            text = convert_runs_to_markdown(para.runs)
+            if not text.strip():
+                continue
+
+            heading_level = get_heading_level(para)
+            if heading_level > 0:
+                markdown_lines.append(f"{'#' * heading_level} {text}")
+                prev_was_list = False
+                continue
+
+            list_style = get_list_style(para)
+            if list_style in ("bullet", "number"):
+                # A blank line opens a list that follows non-list content.
+                if not prev_was_list and markdown_lines:
+                    markdown_lines.append("")
+                marker = "-" if list_style == "bullet" else "1."
+                markdown_lines.append(f"{marker} {text}")
+                prev_was_list = True
+            else:
+                # A blank line closes a list before the next plain paragraph.
+                if prev_was_list and markdown_lines:
+                    markdown_lines.append("")
+                markdown_lines.append(text)
+                markdown_lines.append("")
+                prev_was_list = False
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        return None, f"python-docx 解析失败: {str(e)}"
+
+
+def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file into Markdown by reading its XML parts directly.
+
+    ``word/styles.xml`` is read on a best-effort basis to map style ids to
+    heading levels and list kinds. The direct children of ``w:body`` in
+    ``word/document.xml`` are then converted in order: paragraphs become
+    headings, list items or plain text, and tables become pipe tables.
+
+    Args:
+        file_path: Path of the DOCX (zip) file to open.
+
+    Returns:
+        ``(markdown, None)`` on success, or ``(None, error_message)`` when
+        ``document.xml`` or its body is missing, parsing raises, or no content
+        is produced.
+    """
+    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    namespaces = {"w": word_namespace}
+    val_attr = f"{{{word_namespace}}}val"
+    style_id_attr = f"{{{word_namespace}}}styleId"
+
+    # Style names are compared lower-cased so "Heading 1" and "heading 1"
+    # both map to the same level.
+    heading_by_name = {f"heading {level}": level for level in range(1, 7)}
+    heading_by_name["title"] = 1
+
+    def is_toggle_on(run: Any, prop: str) -> bool:
+        """Return True if run property *prop* is present and not switched off."""
+        elem = run.find(f"w:rPr/w:{prop}", namespaces=namespaces)
+        if elem is None:
+            return False
+        # <w:b/> means on; <w:b w:val="0"/> (or false/off) means off.
+        return elem.get(val_attr, "true") not in ("0", "false", "off")
+
+    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
+        """Return the text of all runs under *para* with bold/italic markup."""
+        texts = []
+        for run in para.findall(".//w:r", namespaces=namespaces):
+            # A run may hold several w:t elements; keep all of them.
+            text = "".join(
+                t.text or "" for t in run.findall("w:t", namespaces=namespaces)
+            )
+            if not text:
+                continue
+            if is_toggle_on(run, "b"):
+                text = f"**{text}**"
+            if is_toggle_on(run, "i"):
+                text = f"*{text}*"
+            texts.append(text)
+        return "".join(texts).strip()
+
+    def extract_cell_text(cell: Any, namespaces: dict) -> str:
+        """Return a cell's paragraphs joined by spaces, safe for a pipe table."""
+        parts = [
+            extract_text_with_formatting(para, namespaces)
+            for para in cell.findall(".//w:p", namespaces=namespaces)
+        ]
+        cell_text = " ".join(part for part in parts if part)
+        cell_text = cell_text.replace("\n", " ").strip()
+        # An unescaped pipe would be read as a column separator.
+        return cell_text.replace("|", "\\|")
+
+    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
+        """Render a w:tbl element as a Markdown pipe table ("" if no rows)."""
+        md_lines = []
+        for row in table_elem.findall(".//w:tr", namespaces=namespaces):
+            cell_texts = [
+                extract_cell_text(cell, namespaces)
+                for cell in row.findall(".//w:tc", namespaces=namespaces)
+            ]
+            if not cell_texts:
+                continue
+            is_header = not md_lines
+            md_lines.append("| " + " | ".join(cell_texts) + " |")
+            if is_header:
+                # The separator follows the first row actually emitted.
+                md_lines.append("| " + " | ".join(["---"] * len(cell_texts)) + " |")
+        return "\n".join(md_lines)
+
+    try:
+        style_to_level: dict = {}
+        style_to_list: dict = {}
+        markdown_lines: List[str] = []
+
+        with zipfile.ZipFile(file_path) as zip_file:
+            try:
+                styles_file = safe_open_zip(zip_file, "word/styles.xml")
+                if styles_file:
+                    styles_root = ET.parse(styles_file).getroot()
+                    for style in styles_root.findall(
+                        ".//w:style", namespaces=namespaces
+                    ):
+                        style_id = style.get(style_id_attr)
+                        name_elem = style.find("w:name", namespaces=namespaces)
+                        if not style_id or name_elem is None:
+                            continue
+                        style_name = name_elem.get(val_attr)
+                        if not style_name:
+                            continue
+
+                        level = heading_by_name.get(style_name.lower())
+                        if level is not None:
+                            style_to_level[style_id] = level
+                        elif (
+                            style_name.startswith("List Bullet")
+                            or style_name == "Bullet"
+                        ):
+                            style_to_list[style_id] = "bullet"
+                        elif (
+                            style_name.startswith("List Number")
+                            or style_name == "Number"
+                        ):
+                            style_to_list[style_id] = "number"
+            except Exception:
+                # Deliberately best effort: if styles.xml is missing or
+                # malformed, continue with whatever mappings were collected
+                # and render unmapped paragraphs as plain text.
+                pass
+
+            document_file = safe_open_zip(zip_file, "word/document.xml")
+            if not document_file:
+                return None, "document.xml 不存在或无法访问"
+
+            root = ET.parse(document_file).getroot()
+            body = root.find(".//w:body", namespaces=namespaces)
+            if body is None:
+                return None, "document.xml 中未找到 w:body 元素"
+
+            for child in body.findall("./*", namespaces=namespaces):
+                if child.tag.endswith("}p"):
+                    style_elem = child.find("w:pPr/w:pStyle", namespaces=namespaces)
+                    style_id = (
+                        style_elem.get(val_attr) if style_elem is not None else None
+                    )
+
+                    para_text = extract_text_with_formatting(child, namespaces)
+                    if not para_text:
+                        continue
+
+                    heading_level = style_to_level.get(style_id, 0)
+                    list_style = style_to_list.get(style_id)
+
+                    if heading_level > 0:
+                        markdown_lines.append(f"{'#' * heading_level} {para_text}")
+                    elif list_style == "bullet":
+                        markdown_lines.append(f"- {para_text}")
+                    elif list_style == "number":
+                        markdown_lines.append(f"1. {para_text}")
+                    else:
+                        markdown_lines.append(para_text)
+                    markdown_lines.append("")
+
+                elif child.tag.endswith("}tbl"):
+                    table_md = convert_table_to_markdown(child, namespaces)
+                    if table_md:
+                        markdown_lines.append(table_md)
+                        markdown_lines.append("")
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        return None, f"XML 解析失败: {str(e)}"