增加unstructured处理策略

2026-02-17 20:12:26 +08:00
parent 856700fbe0
commit c693e23888
7 changed files with 603 additions and 730 deletions
--- a/temp/scripts/xlsx_parser.py
+++ b/temp/scripts/xlsx_parser.py
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
 import zipfile
 from typing import List, Optional, Tuple

-from common import parse_with_docling, parse_with_markitdown
+from common import _unstructured_elements_to_markdown, parse_with_docling, parse_with_markitdown


 def parse_xlsx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
@@ -13,6 +13,23 @@ def parse_xlsx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
    return parse_with_docling(file_path)


+def parse_xlsx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse an XLSX file with the unstructured library; return (content, None) or (None, error message)."""
+    try:
+        from unstructured.partition.xlsx import partition_xlsx  # imported lazily, inside the function
+    except ImportError:  # a missing package becomes an error result instead of an exception
+        return None, "unstructured 库未安装"
+
+    try:
+        elements = partition_xlsx(filename=file_path, infer_table_structure=True)
+        content = _unstructured_elements_to_markdown(elements)  # helper imported from common
+        if not content.strip():  # whitespace-only output is reported as an empty document
+            return None, "文档为空"
+        return content, None
+    except Exception as e:  # any failure while partitioning/converting is returned as a message
+        return None, f"unstructured 解析失败: {str(e)}"
+
+
 def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """使用 MarkItDown 库解析 XLSX 文件"""
    return parse_with_markitdown(file_path)
@@ -68,58 +85,59 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:

    def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str:
        cell_type = cell.attrib.get("t")
+
+        if cell_type == "inlineStr":
+            is_elem = cell.find("main:is", xlsx_namespace)
+            if is_elem is not None:
+                t_elem = is_elem.find("main:t", xlsx_namespace)
+                if t_elem is not None and t_elem.text:
+                    return t_elem.text.replace("\n", " ").replace("\r", "")
+            return ""
+
        cell_value_elem = cell.find("main:v", xlsx_namespace)
+        if cell_value_elem is None or not cell_value_elem.text:
+            return ""

-        if cell_value_elem is not None and cell_value_elem.text:
-            cell_value = cell_value_elem.text
+        cell_value = cell_value_elem.text

-            if cell_type == "s":
-                try:
-                    idx = int(cell_value)
-                    if 0 <= idx < len(shared_strings):
-                        text = shared_strings[idx]
-                        return text.replace("\n", " ").replace("\r", "")
-                except (ValueError, IndexError):
-                    pass
-                return ""
-            elif cell_type == "b":
-                return "TRUE" if cell_value == "1" else "FALSE"
-            elif cell_type == "str":
-                return cell_value.replace("\n", " ").replace("\r", "")
-            elif cell_type == "inlineStr":
-                is_elem = cell.find("main:is", xlsx_namespace)
-                if is_elem is not None:
-                    t_elem = is_elem.find("main:t", xlsx_namespace)
-                    if t_elem is not None and t_elem.text:
-                        return t_elem.text.replace("\n", " ").replace("\r", "")
-                return ""
-            elif cell_type == "e":
-                error_codes = {
-                    "#NULL!": "空引用错误",
-                    "#DIV/0!": "除零错误",
-                    "#VALUE!": "值类型错误",
-                    "#REF!": "无效引用",
-                    "#NAME?": "名称错误",
-                    "#NUM!": "数值错误",
-                    "#N/A": "值不可用",
-                }
-                return error_codes.get(cell_value, f"错误: {cell_value}")
-            elif cell_type == "d":
-                return f"[日期] {cell_value}"
-            elif cell_type == "n":
+        if cell_type == "s":
+            try:
+                idx = int(cell_value)
+                if 0 <= idx < len(shared_strings):
+                    text = shared_strings[idx]
+                    return text.replace("\n", " ").replace("\r", "")
+            except (ValueError, IndexError):
+                pass
+            return ""
+        elif cell_type == "b":
+            return "TRUE" if cell_value == "1" else "FALSE"
+        elif cell_type == "str":
+            return cell_value.replace("\n", " ").replace("\r", "")
+        elif cell_type == "e":
+            _ERROR_CODES = {
+                "#NULL!": "空引用错误",
+                "#DIV/0!": "除零错误",
+                "#VALUE!": "值类型错误",
+                "#REF!": "无效引用",
+                "#NAME?": "名称错误",
+                "#NUM!": "数值错误",
+                "#N/A": "值不可用",
+            }
+            return _ERROR_CODES.get(cell_value, f"错误: {cell_value}")
+        elif cell_type == "d":
+            return f"[日期] {cell_value}"
+        elif cell_type == "n":
+            return cell_value
+        elif cell_type is None:
+            try:
+                float_val = float(cell_value)
+                if float_val.is_integer():
+                    return str(int(float_val))
                return cell_value
-            elif cell_type is None:
-                try:
-                    float_val = float(cell_value)
-                    if float_val.is_integer():
-                        return str(int(float_val))
-                    return cell_value
-                except ValueError:
-                    return cell_value
-            else:
+            except ValueError:
                return cell_value
        else:
-            return ""
+            return cell_value

    def get_non_empty_columns(data: List[List[str]]) -> set:
        non_empty_cols = set()