增加lyxy-reader-office skill

2026-02-17 22:50:06 +08:00
parent 9f686270c2
commit 9f04dac50b
25 changed files with 609 additions and 1282 deletions
--- a/skills/lyxy-reader-office/scripts/docx_parser.py
+++ b/skills/lyxy-reader-office/scripts/docx_parser.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+"""DOCX 文件解析模块，提供多种解析方法。"""
+
+import xml.etree.ElementTree as ET
+import zipfile
+from typing import Any, List, Optional, Tuple
+
+from common import (
+    _unstructured_elements_to_markdown,
+    build_markdown_table,
+    parse_with_docling,
+    parse_with_markitdown,
+    safe_open_zip,
+)
+
+
+def parse_docx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file by delegating to the shared ``parse_with_docling`` helper.
+
+    The helper's result is returned unchanged; the annotation declares it
+    as a ``(content, error)`` pair.
+    """
+    outcome = parse_with_docling(file_path)
+    return outcome
+
+
+def parse_docx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file with the ``unstructured`` library.
+
+    Returns ``(markdown, None)`` on success, or ``(None, message)`` when the
+    library is not installed, partitioning/conversion raises, or the
+    converted text is blank.
+    """
+    # The dependency is optional: import lazily and report its absence.
+    try:
+        from unstructured.partition.docx import partition_docx
+    except ImportError:
+        return None, "unstructured 库未安装"
+
+    try:
+        parsed = partition_docx(filename=file_path, infer_table_structure=True)
+        markdown = _unstructured_elements_to_markdown(parsed)
+        if markdown.strip():
+            return markdown, None
+        return None, "文档为空"
+    except Exception as exc:
+        # Any failure is reported through the error slot, never raised.
+        return None, f"unstructured 解析失败: {exc}"
+
+
+def parse_docx_with_pypandoc(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file into Markdown with ``pypandoc``.
+
+    Returns ``(markdown, None)`` on success, or ``(None, message)`` when
+    pypandoc is not installed, the conversion fails, or the converted text
+    is blank. Like the other parsers in this module, it never raises on a
+    conversion failure.
+    """
+    try:
+        import pypandoc
+    except ImportError:
+        return None, "pypandoc-binary 库未安装"
+
+    try:
+        content = pypandoc.convert_file(
+            source_file=file_path,
+            to="md",
+            format="docx",
+            outputfile=None,
+            extra_args=["--wrap=none"],
+        )
+    except OSError as exc:
+        # Reported by the original code as a missing Pandoc executable.
+        return None, f"pypandoc-binary 缺少 Pandoc 可执行文件: {exc}"
+    except RuntimeError as exc:
+        return None, f"pypandoc-binary 解析失败: {exc}"
+    except Exception as exc:
+        # Boundary catch-all so an unexpected error type is reported through
+        # the error slot, matching the sibling parsers, instead of escaping.
+        return None, f"pypandoc-binary 解析失败: {exc}"
+
+    content = content.strip()
+    if not content:
+        return None, "文档为空"
+    return content, None
+
+
+def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file by delegating to the shared ``parse_with_markitdown`` helper.
+
+    The helper's result is returned unchanged; the annotation declares it
+    as a ``(content, error)`` pair.
+    """
+    outcome = parse_with_markitdown(file_path)
+    return outcome
+
+
+def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file into Markdown using the python-docx library.
+
+    Body-level paragraphs and tables are walked in document order:
+
+    * paragraphs whose style name is ``Title`` or ``Heading 1``..``Heading 6``
+      become ATX headings;
+    * paragraphs whose style name is ``Bullet``/``Number`` or starts with
+      ``List Bullet``/``List Number`` become ``- `` / ``1. `` list items;
+    * other paragraphs are emitted as text followed by a blank line;
+    * tables are rendered through ``build_markdown_table``.
+
+    Bold, italic and underlined runs are wrapped in ``**``, ``*`` and
+    ``<u>`` respectively. Paragraphs with no visible run text are skipped.
+
+    Returns ``(markdown, None)`` on success, or ``(None, message)`` when
+    python-docx is not installed, parsing raises, or the result is blank.
+    """
+    try:
+        from docx import Document
+    except ImportError:
+        return None, "python-docx 库未安装"
+
+    try:
+        doc = Document(file_path)
+
+        from docx.table import Table as DocxTable
+        from docx.text.paragraph import Paragraph
+
+        heading_levels = {
+            "Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
+            "Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
+        }
+        list_styles = {"Bullet": "bullet", "Number": "number"}
+        list_markers = {"bullet": "- ", "number": "1. "}
+
+        def get_heading_level(para: Any) -> int:
+            """Return the heading depth for the paragraph style, 0 if none."""
+            if para.style and para.style.name:
+                return heading_levels.get(para.style.name, 0)
+            return 0
+
+        def get_list_style(para: Any) -> Optional[str]:
+            """Return ``"bullet"``/``"number"`` for list styles, else None."""
+            if not para.style or not para.style.name:
+                return None
+            style_name = para.style.name
+            if style_name in list_styles:
+                return list_styles[style_name]
+            if style_name.startswith("List Bullet"):
+                return "bullet"
+            if style_name.startswith("List Number"):
+                return "number"
+            return None
+
+        def convert_runs_to_markdown(runs: List[Any]) -> str:
+            """Join run texts, wrapping each run in its inline markers."""
+            pieces = []
+            for run in runs:
+                text = run.text
+                if not text:
+                    continue
+                if run.bold:
+                    text = f"**{text}**"
+                if run.italic:
+                    text = f"*{text}*"
+                if run.underline:
+                    text = f"<u>{text}</u>"
+                pieces.append(text)
+            return "".join(pieces)
+
+        def convert_table_to_markdown(table: Any) -> str:
+            """Render a table; line breaks inside a cell become spaces."""
+            rows_data = [
+                [cell.text.strip().replace("\n", " ") for cell in row.cells]
+                for row in table.rows
+            ]
+            return build_markdown_table(rows_data)
+
+        markdown_lines: List[str] = []
+        prev_was_list = False
+
+        for element in doc.element.body:
+            tag = element.tag
+            if tag.endswith("}p"):
+                para = Paragraph(element, doc)
+                text = convert_runs_to_markdown(para.runs)
+                if not text.strip():
+                    continue
+
+                heading_level = get_heading_level(para)
+                # A heading style wins over a list style.
+                list_style = None if heading_level > 0 else get_list_style(para)
+
+                if list_style in list_markers:
+                    # Separate the first item of a list from preceding content.
+                    if not prev_was_list and markdown_lines:
+                        markdown_lines.append("")
+                    markdown_lines.append(f"{list_markers[list_style]}{text}")
+                    prev_was_list = True
+                    continue
+
+                # Close a preceding list with a blank line so the next block
+                # (heading or paragraph) is not read as part of the last item.
+                if prev_was_list and markdown_lines:
+                    markdown_lines.append("")
+                prev_was_list = False
+
+                if heading_level > 0:
+                    markdown_lines.append(f"{'#' * heading_level} {text}")
+                else:
+                    markdown_lines.append(text)
+                    markdown_lines.append("")
+
+            elif tag.endswith("}tbl"):
+                table_md = convert_table_to_markdown(DocxTable(element, doc))
+                if table_md:
+                    # Without a separator a table directly after a list item
+                    # can be absorbed into that item by Markdown renderers.
+                    if prev_was_list and markdown_lines:
+                        markdown_lines.append("")
+                    markdown_lines.append(table_md)
+                    markdown_lines.append("")
+                prev_was_list = False
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        # Any failure is reported through the error slot, never raised.
+        return None, f"python-docx 解析失败: {str(e)}"
+
+
+def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file into Markdown using only ``zipfile`` and ElementTree.
+
+    ``word/styles.xml`` is read (best effort) to map style ids to heading
+    levels and list kinds by style name; ``word/document.xml`` is then walked
+    in body order. Paragraphs become headings, ``- `` / ``1. `` list items or
+    plain text, each followed by a blank line; tables are rendered through
+    ``build_markdown_table``.
+
+    Text extraction details:
+
+    * every ``w:t`` child of a run is used, not only the first one;
+    * bold/italic come from the run's own ``w:rPr`` and honour an explicit
+      ``w:val`` of ``0``/``false``/``off``;
+    * paragraphs inside one table cell are joined with a single space;
+    * only a table's own rows and cells are used; a nested table is flattened
+      into the text of the cell that contains it.
+
+    Returns ``(markdown, None)`` on success, or ``(None, message)`` when the
+    archive or its XML cannot be read, the body is missing, or the result is
+    blank.
+    """
+    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    namespaces = {"w": word_namespace}
+    val_attr = f"{{{word_namespace}}}val"
+    para_tag = f"{{{word_namespace}}}p"
+    table_tag = f"{{{word_namespace}}}tbl"
+
+    _STYLE_NAME_TO_HEADING = {
+        "title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
+        "heading 4": 4, "heading 5": 5, "heading 6": 6,
+    }
+
+    def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
+        """Return the heading depth mapped to ``style_id``, 0 if none."""
+        return style_to_level.get(style_id, 0)
+
+    def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]:
+        """Return ``"bullet"``/``"number"`` mapped to ``style_id``, else None."""
+        return style_to_list.get(style_id, None)
+
+    def is_toggle_on(run: Any, name: str) -> bool:
+        """Return True when the run's own properties enable toggle ``name``."""
+        toggle = run.find(f"w:rPr/w:{name}", namespaces=namespaces)
+        if toggle is None:
+            return False
+        # A present element with no w:val means "on"; an explicit
+        # 0/false/off switches the property off.
+        return toggle.get(val_attr, "true").lower() not in ("0", "false", "off")
+
+    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
+        """Return the text of all runs under ``para`` with inline markers."""
+        texts = []
+        for run in para.findall(".//w:r", namespaces=namespaces):
+            # A run may hold several w:t children (e.g. split by tabs/breaks).
+            text = "".join(
+                text_elem.text or ""
+                for text_elem in run.findall("w:t", namespaces=namespaces)
+            )
+            if not text:
+                continue
+            if is_toggle_on(run, "b"):
+                text = f"**{text}**"
+            if is_toggle_on(run, "i"):
+                text = f"*{text}*"
+            texts.append(text)
+        return "".join(texts).strip()
+
+    def extract_cell_text(cell: Any, namespaces: dict) -> str:
+        """Return the cell's paragraphs joined by spaces, flattening nested tables."""
+        parts = []
+        for child in cell:
+            if child.tag == para_tag:
+                parts.append(extract_text_with_formatting(child, namespaces))
+            elif child.tag == table_tag:
+                for nested_cell in child.iterfind("w:tr/w:tc", namespaces=namespaces):
+                    parts.append(extract_cell_text(nested_cell, namespaces))
+        return " ".join(part for part in parts if part)
+
+    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
+        """Render the table's own rows; rows without cells are dropped."""
+        rows = table_elem.findall("w:tr", namespaces=namespaces)
+        if not rows:
+            return ""
+        rows_data = []
+        for row in rows:
+            cell_texts = [
+                extract_cell_text(cell, namespaces).replace("\n", " ").strip()
+                for cell in row.findall("w:tc", namespaces=namespaces)
+            ]
+            if cell_texts:
+                rows_data.append(cell_texts)
+        return build_markdown_table(rows_data)
+
+    try:
+        style_to_level = {}
+        style_to_list = {}
+        markdown_lines = []
+
+        with zipfile.ZipFile(file_path) as zip_file:
+            try:
+                styles_file = safe_open_zip(zip_file, "word/styles.xml")
+                if styles_file:
+                    styles_root = ET.parse(styles_file).getroot()
+                    for style in styles_root.findall(
+                        ".//w:style", namespaces=namespaces
+                    ):
+                        style_id = style.get(f"{{{word_namespace}}}styleId")
+                        style_name_elem = style.find("w:name", namespaces=namespaces)
+                        if not style_id or style_name_elem is None:
+                            continue
+                        style_name = style_name_elem.get(val_attr)
+                        if not style_name:
+                            continue
+                        style_name_lower = style_name.lower()
+                        if style_name_lower in _STYLE_NAME_TO_HEADING:
+                            style_to_level[style_id] = _STYLE_NAME_TO_HEADING[style_name_lower]
+                        elif (
+                            style_name_lower.startswith("list bullet")
+                            or style_name_lower == "bullet"
+                        ):
+                            style_to_list[style_id] = "bullet"
+                        elif (
+                            style_name_lower.startswith("list number")
+                            or style_name_lower == "number"
+                        ):
+                            style_to_list[style_id] = "number"
+            except Exception:
+                # Deliberate best effort: if styles.xml is missing or broken,
+                # the document is still parsed, just without heading and
+                # list detection for the styles that were not mapped.
+                pass
+
+            document_file = safe_open_zip(zip_file, "word/document.xml")
+            if not document_file:
+                return None, "document.xml 不存在或无法访问"
+
+            root = ET.parse(document_file).getroot()
+            body = root.find(".//w:body", namespaces=namespaces)
+            if body is None:
+                return None, "document.xml 中未找到 w:body 元素"
+
+            for child in body:
+                if child.tag.endswith("}p"):
+                    # Read the paragraph's own style, not a tracked-change copy.
+                    style_elem = child.find("w:pPr/w:pStyle", namespaces=namespaces)
+                    style_id = (
+                        style_elem.get(val_attr)
+                        if style_elem is not None
+                        else None
+                    )
+
+                    heading_level = get_heading_level(style_id, style_to_level)
+                    list_style = get_list_style(style_id, style_to_list)
+                    para_text = extract_text_with_formatting(child, namespaces)
+
+                    if para_text:
+                        if heading_level > 0:
+                            markdown_lines.append(f"{'#' * heading_level} {para_text}")
+                        elif list_style == "bullet":
+                            markdown_lines.append(f"- {para_text}")
+                        elif list_style == "number":
+                            markdown_lines.append(f"1. {para_text}")
+                        else:
+                            markdown_lines.append(para_text)
+                        markdown_lines.append("")
+
+                elif child.tag.endswith("}tbl"):
+                    table_md = convert_table_to_markdown(child, namespaces)
+                    if table_md:
+                        markdown_lines.append(table_md)
+                        markdown_lines.append("")
+
+        content = "\n".join(markdown_lines)
+        if not content.strip():
+            return None, "文档为空"
+        return content, None
+    except Exception as e:
+        # Any failure is reported through the error slot, never raised.
+        return None, f"XML 解析失败: {str(e)}"