增加unstructured处理策略
This commit is contained in:
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from common import parse_with_docling, parse_with_markitdown
|
||||
from common import _unstructured_elements_to_markdown, parse_with_docling, parse_with_markitdown
|
||||
|
||||
|
||||
def parse_xlsx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
@@ -13,6 +13,23 @@ def parse_xlsx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str
|
||||
return parse_with_docling(file_path)
|
||||
|
||||
|
||||
def parse_xlsx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file with the ``unstructured`` library.

    Args:
        file_path: Path of the workbook, passed to ``partition_xlsx`` as
            ``filename``.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the text
        produced by ``_unstructured_elements_to_markdown`` and ``error`` is
        ``None``. Otherwise ``content`` is ``None`` and ``error`` is a message
        saying that the library is missing, the document is empty, or
        parsing failed.
    """
    # The import lives inside the function so that a missing package is
    # reported through the return value instead of failing at module import.
    try:
        from unstructured.partition.xlsx import partition_xlsx
    except ImportError:
        return None, "unstructured 库未安装"

    # Everything below stays inside one try block: any exception raised by
    # partitioning, conversion, or the emptiness check becomes an error result.
    try:
        parsed_elements = partition_xlsx(
            filename=file_path,
            infer_table_structure=True,
        )
        markdown = _unstructured_elements_to_markdown(parsed_elements)
        if markdown.strip():
            return markdown, None
        # Whitespace-only output is treated as an empty document.
        return None, "文档为空"
    except Exception as exc:
        return None, f"unstructured 解析失败: {str(exc)}"
|
||||
|
||||
|
||||
def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file with the MarkItDown library.

    This is a thin wrapper: the path is forwarded unchanged to
    ``parse_with_markitdown`` and its result is returned as-is.
    """
    outcome = parse_with_markitdown(file_path)
    return outcome
|
||||
@@ -68,58 +85,59 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str:
|
||||
cell_type = cell.attrib.get("t")
|
||||
|
||||
if cell_type == "inlineStr":
|
||||
is_elem = cell.find("main:is", xlsx_namespace)
|
||||
if is_elem is not None:
|
||||
t_elem = is_elem.find("main:t", xlsx_namespace)
|
||||
if t_elem is not None and t_elem.text:
|
||||
return t_elem.text.replace("\n", " ").replace("\r", "")
|
||||
return ""
|
||||
|
||||
cell_value_elem = cell.find("main:v", xlsx_namespace)
|
||||
if cell_value_elem is None or not cell_value_elem.text:
|
||||
return ""
|
||||
|
||||
if cell_value_elem is not None and cell_value_elem.text:
|
||||
cell_value = cell_value_elem.text
|
||||
cell_value = cell_value_elem.text
|
||||
|
||||
if cell_type == "s":
|
||||
try:
|
||||
idx = int(cell_value)
|
||||
if 0 <= idx < len(shared_strings):
|
||||
text = shared_strings[idx]
|
||||
return text.replace("\n", " ").replace("\r", "")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
return ""
|
||||
elif cell_type == "b":
|
||||
return "TRUE" if cell_value == "1" else "FALSE"
|
||||
elif cell_type == "str":
|
||||
return cell_value.replace("\n", " ").replace("\r", "")
|
||||
elif cell_type == "inlineStr":
|
||||
is_elem = cell.find("main:is", xlsx_namespace)
|
||||
if is_elem is not None:
|
||||
t_elem = is_elem.find("main:t", xlsx_namespace)
|
||||
if t_elem is not None and t_elem.text:
|
||||
return t_elem.text.replace("\n", " ").replace("\r", "")
|
||||
return ""
|
||||
elif cell_type == "e":
|
||||
error_codes = {
|
||||
"#NULL!": "空引用错误",
|
||||
"#DIV/0!": "除零错误",
|
||||
"#VALUE!": "值类型错误",
|
||||
"#REF!": "无效引用",
|
||||
"#NAME?": "名称错误",
|
||||
"#NUM!": "数值错误",
|
||||
"#N/A": "值不可用",
|
||||
}
|
||||
return error_codes.get(cell_value, f"错误: {cell_value}")
|
||||
elif cell_type == "d":
|
||||
return f"[日期] {cell_value}"
|
||||
elif cell_type == "n":
|
||||
if cell_type == "s":
|
||||
try:
|
||||
idx = int(cell_value)
|
||||
if 0 <= idx < len(shared_strings):
|
||||
text = shared_strings[idx]
|
||||
return text.replace("\n", " ").replace("\r", "")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
return ""
|
||||
elif cell_type == "b":
|
||||
return "TRUE" if cell_value == "1" else "FALSE"
|
||||
elif cell_type == "str":
|
||||
return cell_value.replace("\n", " ").replace("\r", "")
|
||||
elif cell_type == "e":
|
||||
_ERROR_CODES = {
|
||||
"#NULL!": "空引用错误",
|
||||
"#DIV/0!": "除零错误",
|
||||
"#VALUE!": "值类型错误",
|
||||
"#REF!": "无效引用",
|
||||
"#NAME?": "名称错误",
|
||||
"#NUM!": "数值错误",
|
||||
"#N/A": "值不可用",
|
||||
}
|
||||
return _ERROR_CODES.get(cell_value, f"错误: {cell_value}")
|
||||
elif cell_type == "d":
|
||||
return f"[日期] {cell_value}"
|
||||
elif cell_type == "n":
|
||||
return cell_value
|
||||
elif cell_type is None:
|
||||
try:
|
||||
float_val = float(cell_value)
|
||||
if float_val.is_integer():
|
||||
return str(int(float_val))
|
||||
return cell_value
|
||||
elif cell_type is None:
|
||||
try:
|
||||
float_val = float(cell_value)
|
||||
if float_val.is_integer():
|
||||
return str(int(float_val))
|
||||
return cell_value
|
||||
except ValueError:
|
||||
return cell_value
|
||||
else:
|
||||
except ValueError:
|
||||
return cell_value
|
||||
else:
|
||||
return ""
|
||||
return cell_value
|
||||
|
||||
def get_non_empty_columns(data: List[List[str]]) -> set:
|
||||
non_empty_cols = set()
|
||||
|
||||
Reference in New Issue
Block a user