增加unstructured处理策略

2026-02-17 20:12:26 +08:00
parent 856700fbe0
commit c693e23888
7 changed files with 603 additions and 730 deletions
--- a/temp/scripts/docx_parser.py
+++ b/temp/scripts/docx_parser.py
@@ -6,6 +6,7 @@ import zipfile
 from typing import Any, List, Optional, Tuple

 from common import (
+    _unstructured_elements_to_markdown,
    build_markdown_table,
    parse_with_docling,
    parse_with_markitdown,
@@ -18,6 +19,23 @@ def parse_docx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
    return parse_with_docling(file_path)


+def parse_docx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a DOCX file with the unstructured library; return (content, None) on success or (None, error message) on failure."""
+    try:
+        from unstructured.partition.docx import partition_docx  # imported lazily so a missing package is reported, not raised
+    except ImportError:
+        return None, "unstructured 库未安装"
+
+    try:
+        elements = partition_docx(filename=file_path, infer_table_structure=True)
+        content = _unstructured_elements_to_markdown(elements)  # helper imported from common; presumably renders Markdown — confirm there
+        if not content.strip():  # whitespace-only output is reported as an empty document
+            return None, "文档为空"
+        return content, None
+    except Exception as e:  # any failure while partitioning/converting is returned as a message instead of propagating
+        return None, f"unstructured 解析失败: {str(e)}"
+
+
 def parse_docx_with_pypandoc(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """使用 pypandoc-binary 库解析 DOCX 文件。"""
    try:
@@ -59,32 +77,29 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
    try:
        doc = Document(file_path)

+        _HEADING_LEVELS = {
+            "Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
+            "Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
+        }
+
        def get_heading_level(para: Any) -> int:
            if para.style and para.style.name:
-                style_name = para.style.name
-                if style_name == "Title":
-                    return 1
-                elif style_name == "Heading 1":
-                    return 1
-                elif style_name == "Heading 2":
-                    return 2
-                elif style_name == "Heading 3":
-                    return 3
-                elif style_name == "Heading 4":
-                    return 4
-                elif style_name == "Heading 5":
-                    return 5
-                elif style_name == "Heading 6":
-                    return 6
+                return _HEADING_LEVELS.get(para.style.name, 0)
            return 0

+        _LIST_STYLES = {
+            "Bullet": "bullet", "Number": "number",
+        }
+
        def get_list_style(para: Any) -> Optional[str]:
            if not para.style or not para.style.name:
                return None
            style_name = para.style.name
-            if style_name.startswith("List Bullet") or style_name == "Bullet":
+            if style_name in _LIST_STYLES:
+                return _LIST_STYLES[style_name]
+            if style_name.startswith("List Bullet"):
                return "bullet"
-            elif style_name.startswith("List Number") or style_name == "Number":
+            if style_name.startswith("List Number"):
                return "number"
            return None

@@ -170,6 +185,11 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}

+    _STYLE_NAME_TO_HEADING = {
+        "title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
+        "heading 4": 4, "heading 5": 5, "heading 6": 6,
+    }
+
    def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
        return style_to_level.get(style_id, 0)

@@ -195,8 +215,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
        if not rows:
            return ""
-        md_lines = []
-        for i, row in enumerate(rows):
+        rows_data = []
+        for row in rows:
            cells = row.findall(".//w:tc", namespaces=namespaces)
            cell_texts = []
            for cell in cells:
@@ -204,12 +224,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                cell_text = cell_text.replace("\n", " ").strip()
                cell_texts.append(cell_text if cell_text else "")
            if cell_texts:
-                md_line = "| " + " | ".join(cell_texts) + " |"
-                md_lines.append(md_line)
-                if i == 0:
-                    sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |"
-                    md_lines.append(sep_line)
-        return "\n".join(md_lines)
+                rows_data.append(cell_texts)
+        return build_markdown_table(rows_data)

    try:
        style_to_level = {}
@@ -230,20 +246,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                            style_name = style_name_elem.get(f"{{{word_namespace}}}val")
                            if style_name:
                                style_name_lower = style_name.lower()
-                                if style_name_lower == "title":
-                                    style_to_level[style_id] = 1
-                                elif style_name_lower == "heading 1":
-                                    style_to_level[style_id] = 1
-                                elif style_name_lower == "heading 2":
-                                    style_to_level[style_id] = 2
-                                elif style_name_lower == "heading 3":
-                                    style_to_level[style_id] = 3
-                                elif style_name_lower == "heading 4":
-                                    style_to_level[style_id] = 4
-                                elif style_name_lower == "heading 5":
-                                    style_to_level[style_id] = 5
-                                elif style_name_lower == "heading 6":
-                                    style_to_level[style_id] = 6
+                                if style_name_lower in _STYLE_NAME_TO_HEADING:
+                                    style_to_level[style_id] = _STYLE_NAME_TO_HEADING[style_name_lower]
                                elif (
                                    style_name_lower.startswith("list bullet")
                                    or style_name_lower == "bullet"