1
0
Files
Skill/temp/scripts/xlsx_parser.py

287 lines
11 KiB
Python

#!/usr/bin/env python3
"""XLSX 文件解析模块,提供三种解析方法。"""
import xml.etree.ElementTree as ET
import zipfile
from typing import List, Optional, Tuple
from common import _unstructured_elements_to_markdown, parse_with_docling, parse_with_markitdown
def parse_xlsx_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file by delegating to the shared docling helper.

    Args:
        file_path: Path of the XLSX file to parse.

    Returns:
        Whatever ``parse_with_docling`` returns for ``file_path``; the
        annotation declares it as a ``(content, error)`` pair.
    """
    outcome = parse_with_docling(file_path)
    return outcome
def parse_xlsx_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file with the optional ``unstructured`` library.

    Args:
        file_path: Path of the XLSX file to parse.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, error_message)``.
        An error is reported when the library cannot be imported, when the
        converted content is blank, or when any exception is raised while
        partitioning or converting.
    """
    # The dependency is optional, so it is imported lazily and its absence
    # is reported as an ordinary error result.
    try:
        from unstructured.partition.xlsx import partition_xlsx
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        markdown = _unstructured_elements_to_markdown(
            partition_xlsx(filename=file_path, infer_table_structure=True)
        )
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"unstructured 解析失败: {exc!s}"
def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file by delegating to the shared MarkItDown helper.

    Args:
        file_path: Path of the XLSX file to parse.

    Returns:
        Whatever ``parse_with_markitdown`` returns for ``file_path``; the
        annotation declares it as a ``(content, error)`` pair.
    """
    outcome = parse_with_markitdown(file_path)
    return outcome
def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file with pandas and render each sheet as a Markdown table.

    Every sheet becomes a ``## <sheet name>`` section. Sheets with zero data
    rows get an "empty sheet" placeholder; all others are rendered with
    ``tabulate`` in ``pipe`` format, including the DataFrame index column.

    Args:
        file_path: Path of the XLSX file to parse.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, error_message)``.
        An error is reported when pandas/tabulate (or one of their
        dependencies) cannot be imported, when no sheet sections were
        produced, or when reading/rendering raises any exception.
    """
    try:
        import pandas as pd
        from tabulate import tabulate
    except ImportError as e:
        # ImportError.name identifies the module that actually failed to
        # import, which may be a transitive dependency (e.g. numpy) rather
        # than pandas or tabulate. Fall back to inspecting the message only
        # when the exception carries no module name.
        if e.name:
            missing_lib = e.name.partition(".")[0]
        else:
            missing_lib = "pandas" if "pandas" in str(e) else "tabulate"
        return None, f"{missing_lib} 库未安装"

    try:
        # sheet_name=None loads every sheet as {sheet name: DataFrame}.
        sheets = pd.read_excel(file_path, sheet_name=None)

        markdown_parts = []
        for sheet_name, df in sheets.items():
            if len(df) == 0:
                markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
                continue
            table_md = tabulate(
                df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
            )
            markdown_parts.append(f"## {sheet_name}\n\n{table_md}")

        if not markdown_parts:
            return None, "Excel 文件为空"

        markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
        return markdown_content, None
    except Exception as e:
        return None, f"pandas 解析失败: {str(e)}"
def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file by reading its XML parts directly from the ZIP package.

    The workbook part supplies sheet names and relationship ids, the workbook
    relationships map those ids to worksheet parts, and the shared-strings
    part supplies text referenced by ``t="s"`` cells. Each sheet is rendered
    as a ``## <sheet name>`` section containing a Markdown table whose header
    is the first row that holds at least one cell; columns that are blank in
    every row are dropped.

    Args:
        file_path: Path of the XLSX file to parse.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, error_message)``.
        An error is reported when ``xl/workbook.xml`` is missing, when no
        named sheet is found, or when any exception is raised while reading
        the package. A worksheet part that is missing from the archive is
        reported inside the output as a failed section instead of aborting.
    """
    xlsx_namespace = {
        "main": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
    }
    doc_rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    pkg_rel_ns = "http://schemas.openxmlformats.org/package/2006/relationships"

    # Built once per call instead of once per error cell.
    error_labels = {
        "#NULL!": "空引用错误",
        "#DIV/0!": "除零错误",
        "#VALUE!": "值类型错误",
        "#REF!": "无效引用",
        "#NAME?": "名称错误",
        "#NUM!": "数值错误",
        "#N/A": "值不可用",
    }

    def single_line(text: str) -> str:
        """Collapse line breaks so the text stays inside one table row."""
        return text.replace("\n", " ").replace("\r", "")

    def collect_text(container: ET.Element) -> str:
        """Return the text of a string item (``<si>`` or ``<is>``).

        Plain strings keep their text in a direct ``<t>`` child; rich-text
        strings split it over several ``<r><t>`` runs that must be joined.
        Only these two locations are read, so ``<t>`` elements nested in
        other children (such as phonetic runs) are ignored.
        """
        nodes = container.findall("main:t", xlsx_namespace)
        nodes += container.findall("main:r/main:t", xlsx_namespace)
        return "".join(node.text or "" for node in nodes)

    def parse_col_index(cell_ref: str) -> int:
        """Convert the letters of a cell reference ("A1", "AB12") to a 0-based column.

        Returns -1 when the reference does not start with an ASCII letter.
        """
        col_index = 0
        for char in cell_ref.upper():
            if not "A" <= char <= "Z":
                break
            col_index = col_index * 26 + (ord(char) - ord("A") + 1)
        return col_index - 1

    def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str:
        """Return the display text of one ``<c>`` element based on its ``t`` type."""
        cell_type = cell.attrib.get("t")

        if cell_type == "inlineStr":
            is_elem = cell.find("main:is", xlsx_namespace)
            if is_elem is None:
                return ""
            return single_line(collect_text(is_elem))

        cell_value_elem = cell.find("main:v", xlsx_namespace)
        if cell_value_elem is None or not cell_value_elem.text:
            return ""
        cell_value = cell_value_elem.text

        if cell_type == "s":
            # The value is an index into the shared-strings table; invalid
            # or out-of-range indexes yield an empty cell.
            try:
                idx = int(cell_value)
            except ValueError:
                return ""
            if 0 <= idx < len(shared_strings):
                return single_line(shared_strings[idx])
            return ""
        if cell_type == "b":
            return "TRUE" if cell_value == "1" else "FALSE"
        if cell_type == "str":
            return single_line(cell_value)
        if cell_type == "e":
            return error_labels.get(cell_value, f"错误: {cell_value}")
        if cell_type == "d":
            return f"[日期] {cell_value}"
        if cell_type is None or cell_type == "n":
            # A missing type and an explicit "n" are both handled as numbers,
            # so they are formatted the same way: integral values lose a
            # trailing ".0", everything else is returned verbatim.
            try:
                float_val = float(cell_value)
            except ValueError:
                return cell_value
            if float_val.is_integer():
                return str(int(float_val))
            return cell_value
        return cell_value

    def get_non_empty_columns(data: List[List[str]]) -> set:
        """Return the indexes of columns holding non-blank text in any row."""
        return {
            col_idx
            for row in data
            for col_idx, cell in enumerate(row)
            if cell and cell.strip()
        }

    def filter_columns(row: List[str], non_empty_cols: set) -> List[str]:
        """Keep only the selected columns, padding short rows with ""."""
        return [row[i] if i < len(row) else "" for i in sorted(non_empty_cols)]

    def data_to_markdown(data: List[List[str]], sheet_name: str) -> str:
        """Render rows as a Markdown table section; the first row is the header."""
        if not data or not data[0]:
            return f"## {sheet_name}\n\n*工作表为空*"
        non_empty_cols = get_non_empty_columns(data)
        if not non_empty_cols:
            return f"## {sheet_name}\n\n*工作表为空*"

        filtered_headers = filter_columns(data[0], non_empty_cols)
        md_lines = [
            f"## {sheet_name}",
            "",
            "| " + " | ".join(filtered_headers) + " |",
            "| " + " | ".join(["---"] * len(filtered_headers)) + " |",
        ]
        for row in data[1:]:
            md_lines.append(
                "| " + " | ".join(filter_columns(row, non_empty_cols)) + " |"
            )
        md_lines.append("")
        return "\n".join(md_lines)

    def read_rows(sheet_root: ET.Element, shared_strings: List[str]) -> List[List[str]]:
        """Return the dense cell texts of every row that contains a cell."""
        sheet_data = sheet_root.find("main:sheetData", xlsx_namespace)
        if sheet_data is None:
            return []
        rows = []
        for row_elem in sheet_data.findall("main:row", xlsx_namespace):
            col_dict = {}
            next_col = 0
            for cell in row_elem.findall("main:c", xlsx_namespace):
                # The "r" reference is optional in SpreadsheetML; a cell
                # without a usable one sits right after the previous cell.
                cell_ref = cell.attrib.get("r", "")
                col_index = parse_col_index(cell_ref) if cell_ref else -1
                if col_index < 0:
                    col_index = next_col
                next_col = col_index + 1
                col_dict[col_index] = parse_cell_value(cell, shared_strings)
            if col_dict:
                # Fill gaps so every row is a dense list up to its last cell.
                rows.append([col_dict.get(i, "") for i in range(max(col_dict) + 1)])
        return rows

    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            # Sheet names and relationship ids, in workbook order.
            try:
                with zip_file.open("xl/workbook.xml") as f:
                    workbook_root = ET.parse(f).getroot()
            except KeyError:
                return None, "无法解析工作表名称"

            sheets = []
            for sheet in workbook_root.findall(".//main:sheet", xlsx_namespace):
                sheet_name = sheet.attrib.get("name", "")
                if sheet_name:
                    rid = sheet.attrib.get(f"{{{doc_rel_ns}}}id", "")
                    sheets.append((sheet_name, rid))
            if not sheets:
                return None, "未找到工作表"

            # Relationship id -> worksheet part. The part is optional; when
            # it is absent the conventional sheetN.xml path is used instead.
            rid_to_target = {}
            try:
                with zip_file.open("xl/_rels/workbook.xml.rels") as f:
                    rels_root = ET.parse(f).getroot()
                for rel in rels_root.findall(f"{{{pkg_rel_ns}}}Relationship"):
                    rid = rel.attrib.get("Id", "")
                    target = rel.attrib.get("Target", "")
                    if rid and target:
                        rid_to_target[rid] = target
            except KeyError:
                pass

            # The shared-strings part is optional as well.
            shared_strings = []
            try:
                with zip_file.open("xl/sharedStrings.xml") as f:
                    strings_root = ET.parse(f).getroot()
                shared_strings = [
                    collect_text(si)
                    for si in strings_root.findall(".//main:si", xlsx_namespace)
                ]
            except KeyError:
                pass

            sections = ["# Excel数据转换结果 (原生XML解析)\n\n"]
            for sheet_index, (sheet_name, rid) in enumerate(sheets):
                target = rid_to_target.get(rid, "")
                if not target:
                    worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml"
                elif target.startswith("/"):
                    # Absolute targets are relative to the package root.
                    worksheet_path = target.lstrip("/")
                else:
                    worksheet_path = f"xl/{target}"

                try:
                    with zip_file.open(worksheet_path) as f:
                        sheet_root = ET.parse(f).getroot()
                except KeyError:
                    # Worksheet part missing from the archive.
                    sections.append(f"## {sheet_name}\n\n*工作表解析失败*\n\n")
                    continue

                rows = read_rows(sheet_root, shared_strings)
                sections.append(data_to_markdown(rows, sheet_name) + "\n\n")

            return "".join(sections), None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"