增加unstructured处理策略
This commit is contained in:
@@ -6,6 +6,7 @@ import zipfile
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from common import (
|
||||
_unstructured_elements_to_markdown,
|
||||
build_markdown_table,
|
||||
parse_with_docling,
|
||||
parse_with_markitdown,
|
||||
@@ -18,6 +19,23 @@ def parse_docx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
|
||||
return parse_with_docling(file_path)
|
||||
|
||||
|
||||
def parse_docx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file with the ``unstructured`` library.

    Args:
        file_path: Path of the DOCX file to parse.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the string
        returned by ``_unstructured_elements_to_markdown`` and ``error`` is
        ``None``. On failure ``content`` is ``None`` and ``error`` is a
        message describing why (library missing, empty document, or the
        exception raised while parsing).
    """
    # Import lazily so a missing optional dependency is reported through the
    # error slot of the return value instead of failing at module import time.
    try:
        from unstructured.partition.docx import partition_docx
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        elements = partition_docx(filename=file_path, infer_table_structure=True)
        content = _unstructured_elements_to_markdown(elements)
        # Guard against a falsy result (None / "") before calling .strip(), so
        # an empty conversion is reported as an empty document rather than as
        # a parse failure caused by an AttributeError.
        if not content or not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        # Boundary handler: any failure is returned to the caller as a message.
        return None, f"unstructured 解析失败: {e}"
|
||||
|
||||
|
||||
def parse_docx_with_pypandoc(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""使用 pypandoc-binary 库解析 DOCX 文件。"""
|
||||
try:
|
||||
@@ -59,32 +77,29 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
|
||||
try:
|
||||
doc = Document(file_path)
|
||||
|
||||
_HEADING_LEVELS = {
|
||||
"Title": 1, "Heading 1": 1, "Heading 2": 2, "Heading 3": 3,
|
||||
"Heading 4": 4, "Heading 5": 5, "Heading 6": 6,
|
||||
}
|
||||
|
||||
def get_heading_level(para: Any) -> int:
|
||||
if para.style and para.style.name:
|
||||
style_name = para.style.name
|
||||
if style_name == "Title":
|
||||
return 1
|
||||
elif style_name == "Heading 1":
|
||||
return 1
|
||||
elif style_name == "Heading 2":
|
||||
return 2
|
||||
elif style_name == "Heading 3":
|
||||
return 3
|
||||
elif style_name == "Heading 4":
|
||||
return 4
|
||||
elif style_name == "Heading 5":
|
||||
return 5
|
||||
elif style_name == "Heading 6":
|
||||
return 6
|
||||
return _HEADING_LEVELS.get(para.style.name, 0)
|
||||
return 0
|
||||
|
||||
_LIST_STYLES = {
|
||||
"Bullet": "bullet", "Number": "number",
|
||||
}
|
||||
|
||||
def get_list_style(para: Any) -> Optional[str]:
|
||||
if not para.style or not para.style.name:
|
||||
return None
|
||||
style_name = para.style.name
|
||||
if style_name.startswith("List Bullet") or style_name == "Bullet":
|
||||
if style_name in _LIST_STYLES:
|
||||
return _LIST_STYLES[style_name]
|
||||
if style_name.startswith("List Bullet"):
|
||||
return "bullet"
|
||||
elif style_name.startswith("List Number") or style_name == "Number":
|
||||
if style_name.startswith("List Number"):
|
||||
return "number"
|
||||
return None
|
||||
|
||||
@@ -170,6 +185,11 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
namespaces = {"w": word_namespace}
|
||||
|
||||
_STYLE_NAME_TO_HEADING = {
|
||||
"title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
|
||||
"heading 4": 4, "heading 5": 5, "heading 6": 6,
|
||||
}
|
||||
|
||||
def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
|
||||
return style_to_level.get(style_id, 0)
|
||||
|
||||
@@ -195,8 +215,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
rows = table_elem.findall(".//w:tr", namespaces=namespaces)
|
||||
if not rows:
|
||||
return ""
|
||||
md_lines = []
|
||||
for i, row in enumerate(rows):
|
||||
rows_data = []
|
||||
for row in rows:
|
||||
cells = row.findall(".//w:tc", namespaces=namespaces)
|
||||
cell_texts = []
|
||||
for cell in cells:
|
||||
@@ -204,12 +224,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
cell_text = cell_text.replace("\n", " ").strip()
|
||||
cell_texts.append(cell_text if cell_text else "")
|
||||
if cell_texts:
|
||||
md_line = "| " + " | ".join(cell_texts) + " |"
|
||||
md_lines.append(md_line)
|
||||
if i == 0:
|
||||
sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |"
|
||||
md_lines.append(sep_line)
|
||||
return "\n".join(md_lines)
|
||||
rows_data.append(cell_texts)
|
||||
return build_markdown_table(rows_data)
|
||||
|
||||
try:
|
||||
style_to_level = {}
|
||||
@@ -230,20 +246,8 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
style_name = style_name_elem.get(f"{{{word_namespace}}}val")
|
||||
if style_name:
|
||||
style_name_lower = style_name.lower()
|
||||
if style_name_lower == "title":
|
||||
style_to_level[style_id] = 1
|
||||
elif style_name_lower == "heading 1":
|
||||
style_to_level[style_id] = 1
|
||||
elif style_name_lower == "heading 2":
|
||||
style_to_level[style_id] = 2
|
||||
elif style_name_lower == "heading 3":
|
||||
style_to_level[style_id] = 3
|
||||
elif style_name_lower == "heading 4":
|
||||
style_to_level[style_id] = 4
|
||||
elif style_name_lower == "heading 5":
|
||||
style_to_level[style_id] = 5
|
||||
elif style_name_lower == "heading 6":
|
||||
style_to_level[style_id] = 6
|
||||
if style_name_lower in _STYLE_NAME_TO_HEADING:
|
||||
style_to_level[style_id] = _STYLE_NAME_TO_HEADING[style_name_lower]
|
||||
elif (
|
||||
style_name_lower.startswith("list bullet")
|
||||
or style_name_lower == "bullet"
|
||||
|
||||
Reference in New Issue
Block a user