增加unstructured处理策略

2026-02-17 20:12:26 +08:00
parent 856700fbe0
commit c693e23888
7 changed files with 603 additions and 730 deletions
--- a/temp/scripts/pptx_parser.py
+++ b/temp/scripts/pptx_parser.py
@@ -7,6 +7,7 @@ import zipfile
 from typing import Any, List, Optional, Tuple

 from common import (
+    _unstructured_elements_to_markdown,
    build_markdown_table,
    flush_list_stack,
    parse_with_docling,
@@ -19,6 +20,25 @@ def parse_pptx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
    return parse_with_docling(file_path)


+def parse_pptx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a PPTX file with the unstructured library; return (markdown, None) on success or (None, error message) on failure."""
+    try:
+        from unstructured.partition.pptx import partition_pptx
+    except ImportError:  # imported inside the function so a missing package yields an error tuple instead of an import-time crash
+        return None, "unstructured 库未安装"
+
+    try:
+        elements = partition_pptx(
+            filename=file_path, infer_table_structure=True, include_metadata=True
+        )
+        content = _unstructured_elements_to_markdown(elements)
+        if not content.strip():  # whitespace-only output is reported as an empty document
+            return None, "文档为空"
+        return content, None
+    except Exception as e:  # any failure in partitioning or conversion is returned as an error string, not raised
+        return None, f"unstructured 解析失败: {str(e)}"
+
+
 def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """使用 MarkItDown 库解析 PPTX 文件"""
    return parse_with_markitdown(file_path)
@@ -74,6 +94,8 @@ def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional
    except ImportError:
        return None, "python-pptx 库未安装"

+    _A_NS = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
+
    try:
        prs = Presentation(file_path)
        md_content = []
@@ -89,10 +111,7 @@ def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional

                if hasattr(shape, "has_table") and shape.has_table:
                    if list_stack:
-                        md_content.append(
-                            "\n" + "\n".join([x for x in list_stack if x]) + "\n"
-                        )
-                        list_stack.clear()
+                        flush_list_stack(list_stack, md_content)

                    table_md = convert_table_to_md_pptx(shape.table)
                    md_content.append(table_md)
@@ -104,20 +123,8 @@ def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional
                        if pPr is not None:
                            is_list = (
                                para.level > 0
-                                or pPr.find(
-                                    ".//a:buChar",
-                                    namespaces={
-                                        "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
-                                    },
-                                )
-                                is not None
-                                or pPr.find(
-                                    ".//a:buAutoNum",
-                                    namespaces={
-                                        "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
-                                    },
-                                )
-                                is not None
+                                or pPr.find(".//a:buChar", namespaces=_A_NS) is not None
+                                or pPr.find(".//a:buAutoNum", namespaces=_A_NS) is not None
                            )

                        if is_list:
@@ -128,16 +135,9 @@ def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional

                            text = extract_formatted_text_pptx(para.runs)
                            if text:
-                                pPr = para._element.pPr
                                is_ordered = (
                                    pPr is not None
-                                    and pPr.find(
-                                        ".//a:buAutoNum",
-                                        namespaces={
-                                            "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
-                                        },
-                                    )
-                                    is not None
+                                    and pPr.find(".//a:buAutoNum", namespaces=_A_NS) is not None
                                )
                                marker = "1. " if is_ordered else "- "
                                indent = "  " * level
@@ -149,20 +149,14 @@ def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional
                                        list_stack[i] = ""
                        else:
                            if list_stack:
-                                md_content.append(
-                                    "\n"
-                                    + "\n".join([x for x in list_stack if x])
-                                    + "\n"
-                                )
-                                list_stack.clear()
+                                flush_list_stack(list_stack, md_content)

                            text = extract_formatted_text_pptx(para.runs)
                            if text:
                                md_content.append(f"{text}\n")

            if list_stack:
-                md_content.append("\n" + "\n".join([x for x in list_stack if x]) + "\n")
-                list_stack.clear()
+                flush_list_stack(list_stack, md_content)

            md_content.append("---\n")