1
0

增加unstructured处理策略

This commit is contained in:
2026-02-17 20:12:26 +08:00
parent 856700fbe0
commit c693e23888
7 changed files with 603 additions and 730 deletions

View File

@@ -6,6 +6,7 @@ import zipfile
from typing import Any, List, Optional, Tuple
from common import (
_unstructured_elements_to_markdown,
build_markdown_table,
parse_with_docling,
parse_with_markitdown,
@@ -18,6 +19,23 @@ def parse_docx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
return parse_with_docling(file_path)
def parse_docx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file with the ``unstructured`` library.

    Args:
        file_path: Path of the DOCX file, passed to ``partition_docx`` as
            ``filename``.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the text
        produced by ``_unstructured_elements_to_markdown`` and ``error`` is
        ``None``. Otherwise ``content`` is ``None`` and ``error`` describes
        the problem: the library is not installed, the converted text is
        blank, or an exception was raised while parsing.
    """
    # Import lazily so a missing optional dependency is reported as an
    # error tuple instead of breaking the import of this module.
    try:
        from unstructured.partition.docx import partition_docx
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        parsed_elements = partition_docx(filename=file_path, infer_table_structure=True)
        markdown_text = _unstructured_elements_to_markdown(parsed_elements)
        # Whitespace-only output counts as an empty document.
        has_text = bool(markdown_text.strip())
        return (markdown_text, None) if has_text else (None, "文档为空")
    except Exception as exc:
        # Any failure during partitioning or conversion is turned into an
        # error message rather than propagated to the caller.
        return None, f"unstructured 解析失败: {str(exc)}"
def parse_docx_with_pypandoc(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 pypandoc-binary 库解析 DOCX 文件。"""
try:
@@ -59,32 +77,29 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
try:
doc = Document(file_path)
_HEADING_LEVELS = {
"Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
"Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
}
def get_heading_level(para: Any) -> int:
if para.style and para.style.name:
style_name = para.style.name
if style_name == "Title":
return 1
elif style_name == "Heading 1":
return 1
elif style_name == "Heading 2":
return 2
elif style_name == "Heading 3":
return 3
elif style_name == "Heading 4":
return 4
elif style_name == "Heading 5":
return 5
elif style_name == "Heading 6":
return 6
return _HEADING_LEVELS.get(para.style.name, 0)
return 0
_LIST_STYLES = {
"Bullet": "bullet", "Number": "number",
}
def get_list_style(para: Any) -> Optional[str]:
if not para.style or not para.style.name:
return None
style_name = para.style.name
if style_name.startswith("List Bullet") or style_name == "Bullet":
if style_name in _LIST_STYLES:
return _LIST_STYLES[style_name]
if style_name.startswith("List Bullet"):
return "bullet"
elif style_name.startswith("List Number") or style_name == "Number":
if style_name.startswith("List Number"):
return "number"
return None
@@ -170,6 +185,11 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
namespaces = {"w": word_namespace}
_STYLE_NAME_TO_HEADING = {
"title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
"heading 4": 4, "heading 5": 5, "heading 6": 6,
}
def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
    """Return the heading level mapped to ``style_id``.

    Args:
        style_id: Style identifier to look up; may be ``None``.
        style_to_level: Mapping from style identifier to heading level.

    Returns:
        The mapped level, or ``0`` when ``style_id`` has no entry.
    """
    if style_id in style_to_level:
        return style_to_level[style_id]
    return 0
@@ -195,8 +215,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
rows = table_elem.findall(".//w:tr", namespaces=namespaces)
if not rows:
return ""
md_lines = []
for i, row in enumerate(rows):
rows_data = []
for row in rows:
cells = row.findall(".//w:tc", namespaces=namespaces)
cell_texts = []
for cell in cells:
@@ -204,12 +224,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
cell_text = cell_text.replace("\n", " ").strip()
cell_texts.append(cell_text if cell_text else "")
if cell_texts:
md_line = "| " + " | ".join(cell_texts) + " |"
md_lines.append(md_line)
if i == 0:
sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |"
md_lines.append(sep_line)
return "\n".join(md_lines)
rows_data.append(cell_texts)
return build_markdown_table(rows_data)
try:
style_to_level = {}
@@ -230,20 +246,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
style_name = style_name_elem.get(f"{{{word_namespace}}}val")
if style_name:
style_name_lower = style_name.lower()
if style_name_lower == "title":
style_to_level[style_id] = 1
elif style_name_lower == "heading 1":
style_to_level[style_id] = 1
elif style_name_lower == "heading 2":
style_to_level[style_id] = 2
elif style_name_lower == "heading 3":
style_to_level[style_id] = 3
elif style_name_lower == "heading 4":
style_to_level[style_id] = 4
elif style_name_lower == "heading 5":
style_to_level[style_id] = 5
elif style_name_lower == "heading 6":
style_to_level[style_id] = 6
if style_name_lower in _STYLE_NAME_TO_HEADING:
style_to_level[style_id] = _STYLE_NAME_TO_HEADING[style_name_lower]
elif (
style_name_lower.startswith("list bullet")
or style_name_lower == "bullet"