1
0
Files
Skill/temp/document_parser.py

1246 lines
44 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""整合的文档解析器,支持 DOCX、PPTX 和 XLSX 文件:
按优先级尝试多种解析方法:
1. MarkItDown (微软官方库)
2. python-docx、python-pptx 或 pandas (成熟的 Python 库)
3. XML 原生解析 (备选方案)
代码风格要求:
- Python 3.6+ 兼容
- 遵循 PEP 8 规范
- 所有公共 API 函数添加类型提示
- 字符串优先内联使用不提取为常量除非被使用超过3次
- 其他被多次使用的对象根据具体情况可考虑被提取为常量(如正则表达式)
- 模块级和公共 API 函数保留文档字符串
- 内部辅助函数不添加文档字符串(函数名足够描述)
- 变量命名清晰,避免单字母变量名
"""
import argparse
import os
import re
import sys
import zipfile
import xml.etree.ElementTree as ET
from typing import Any, List, Optional, Tuple
# Inline Markdown image syntax: ![alt](target).
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# A whole line that is a single Markdown link whose target carries one of the
# listed media/PDF extensions, optionally followed by a quoted title.
MEDIA_LINK_PATTERN = re.compile(
    r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$'
)
# A whole line of the form "R:<digits> G:<digits> B:<digits>".
RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$")
def build_markdown_table(rows_data: List[List[str]]) -> str:
    """Render a two-dimensional list of cell strings as a Markdown pipe table.

    The first row is used as the header and is followed by a ``---``
    separator row. Every row is padded with empty cells up to the widest
    row so the table stays rectangular, falsy cells (``None``/``""``) are
    rendered as empty, and unescaped ``|`` characters inside a cell are
    escaped so they cannot split the cell into extra columns.

    Args:
        rows_data: Rows of cell texts; the first row is the header.

    Returns:
        The table followed by a blank line, or ``""`` when ``rows_data`` is
        empty or its first row is empty.
    """
    if not rows_data or not rows_data[0]:
        return ""
    column_count = max(len(row) for row in rows_data)

    def render_row(row: List[str]) -> str:
        # Escape only pipes that are not already escaped.
        cells = [re.sub(r"(?<!\\)\|", r"\\|", cell or "") for cell in row]
        cells.extend([""] * (column_count - len(cells)))
        return "| " + " | ".join(cells) + " |"

    md_lines = [
        render_row(rows_data[0]),
        "| " + " | ".join(["---"] * column_count) + " |",
    ]
    md_lines.extend(render_row(row) for row in rows_data[1:])
    return "\n".join(md_lines) + "\n\n"
def flush_list_stack(list_stack: List[str], target: List[str]) -> None:
    """Move the non-empty entries of ``list_stack`` into ``target``.

    Each non-empty entry is appended to ``target`` with a trailing newline,
    in order; ``list_stack`` is emptied in place afterwards.
    """
    target.extend(entry + "\n" for entry in list_stack if entry)
    del list_stack[:]
def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipExtFile]:
    """Open a member of ``zip_file`` after rejecting suspicious member names.

    A name is rejected when it is empty, contains a backslash, has no ``/``
    at all (only members inside a directory are accepted), is absolute,
    starts with ``..``, or contains a ``..`` path segment.

    Args:
        zip_file: An open archive.
        name: Archive member name using ``/`` separators.

    Returns:
        A readable file object for the member, or ``None`` when the name is
        rejected or the archive has no such member.
    """
    if not name or "\\" in name or "/" not in name:
        return None
    if name.startswith("/") or name.startswith(".."):
        return None
    if "/../" in name or name.endswith("/.."):
        return None
    try:
        return zip_file.open(name)
    except KeyError:
        # ZipFile.open raises KeyError for a missing member; callers test the
        # result against None, so report absence the same way as rejection.
        return None
def normalize_markdown_whitespace(content: str) -> str:
    """Collapse every run of consecutive blank lines down to its first line.

    A line counts as blank when it is empty or whitespace-only. The first
    blank line of a run is kept unchanged; the rest of the run is dropped.
    """
    kept = []
    previous_blank = False
    for line in content.split("\n"):
        blank = not line.strip()
        if not (blank and previous_blank):
            kept.append(line)
        previous_blank = blank
    return "\n".join(kept)
def is_valid_docx(file_path: str) -> bool:
    """Return True when ``file_path`` is a ZIP archive with the core DOCX parts.

    The archive must contain ``[Content_Types].xml``, ``_rels/.rels`` and
    ``word/document.xml``. Anything that cannot be opened as a ZIP archive
    (corrupt data, a directory, a missing or unreadable file) yields False
    instead of raising.
    """
    required_files = ("[Content_Types].xml", "_rels/.rels", "word/document.xml")
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            # Build the name set once instead of re-listing per required file.
            member_names = set(zip_file.namelist())
    except (zipfile.BadZipFile, zipfile.LargeZipFile, OSError):
        return False
    return all(required in member_names for required in required_files)
def is_valid_pptx(file_path: str) -> bool:
    """Return True when ``file_path`` is a ZIP archive with the core PPTX parts.

    The archive must contain ``[Content_Types].xml``, ``_rels/.rels`` and
    ``ppt/presentation.xml``. Anything that cannot be opened as a ZIP archive
    (corrupt data, a directory, a missing or unreadable file) yields False
    instead of raising.
    """
    required_files = (
        "[Content_Types].xml",
        "_rels/.rels",
        "ppt/presentation.xml",
    )
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            # Build the name set once instead of re-listing per required file.
            member_names = set(zip_file.namelist())
    except (zipfile.BadZipFile, zipfile.LargeZipFile, OSError):
        return False
    return all(required in member_names for required in required_files)
def is_valid_xlsx(file_path: str) -> bool:
    """Return True when ``file_path`` is a ZIP archive with the core XLSX parts.

    The archive must contain ``[Content_Types].xml``, ``_rels/.rels`` and
    ``xl/workbook.xml``. Anything that cannot be opened as a ZIP archive
    (corrupt data, a directory, a missing or unreadable file) yields False
    instead of raising.
    """
    required_files = ("[Content_Types].xml", "_rels/.rels", "xl/workbook.xml")
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            # Build the name set once instead of re-listing per required file.
            member_names = set(zip_file.namelist())
    except (zipfile.BadZipFile, zipfile.LargeZipFile, OSError):
        return False
    return all(required in member_names for required in required_files)
def remove_markdown_images(markdown_text: str) -> str:
    """Return ``markdown_text`` with every IMAGE_PATTERN match deleted."""
    return re.sub(IMAGE_PATTERN, "", markdown_text)
def filter_markdown_content(content: str) -> str:
    """Filter Markdown down to its textual lines.

    Lines are dropped when, after stripping, they are empty, a one-line HTML
    comment, start with ``![``, contain an ``<img``/``</img>`` tag, or match
    MEDIA_LINK_PATTERN or RGB_COLOR_PATTERN. In the remaining lines
    ``<span>`` wrappers are replaced by their inner text and whitespace runs
    are collapsed to single spaces; lines left empty by that are dropped too.
    """
    kept_lines = []
    for raw_line in content.split("\n"):
        candidate = raw_line.strip()
        is_html_comment = candidate.startswith("<!--") and candidate.endswith("-->")
        if (
            not candidate
            or is_html_comment
            or candidate.startswith("![")
            or "<img" in candidate
            or "</img>" in candidate
            or MEDIA_LINK_PATTERN.match(candidate)
            or RGB_COLOR_PATTERN.match(candidate)
        ):
            continue
        # Styled spans are unwrapped first, then any remaining span; the two
        # passes are not interchangeable when spans are nested.
        unwrapped = re.sub(
            r'<span[^>]*style="[^"]*"[^>]*>(.*?)</span>', r"\1", raw_line
        )
        unwrapped = re.sub(r"<span[^>]*>(.*?)</span>", r"\1", unwrapped)
        collapsed = re.sub(r"\s+", " ", unwrapped).strip()
        if collapsed:
            kept_lines.append(collapsed)
    return "\n".join(kept_lines)
def extract_titles(markdown_text: str) -> List[str]:
    """Return every heading line (levels 1-6) with leading whitespace removed."""
    return [
        candidate.lstrip()
        for candidate in markdown_text.split("\n")
        if get_heading_level(candidate) > 0
    ]
def get_heading_level(line: str) -> int:
    """Return the number of leading ``#`` characters if it is 1-6, else 0.

    Leading whitespace is ignored. No space is required after the ``#`` run.
    """
    text = line.lstrip()
    hash_count = len(text) - len(text.lstrip("#"))
    return hash_count if 1 <= hash_count <= 6 else 0
def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
    """Extract every section whose heading text equals ``title_name``.

    For each matching heading the output contains, in order: its ancestor
    headings (nearest heading of each strictly smaller level, outermost
    first), the heading line itself, and all following lines up to the next
    heading of the same or a higher level.

    Returns:
        The extracted sections joined with newlines, or ``None`` when no
        heading matches.
    """
    all_lines = markdown_text.split("\n")
    levels = [get_heading_level(entry) for entry in all_lines]
    hits = [
        position
        for position, (entry, depth) in enumerate(zip(all_lines, levels))
        if depth > 0 and entry.lstrip()[depth:].strip() == title_name
    ]
    if not hits:
        return None

    collected = []
    for hit in hits:
        hit_depth = levels[hit]

        # Walk upwards collecting the closest heading of each shallower level.
        ancestors = []
        nearest_depth = hit_depth
        for position in range(hit - 1, -1, -1):
            if 0 < levels[position] < nearest_depth:
                ancestors.append(all_lines[position])
                nearest_depth = levels[position]
                if nearest_depth == 1:
                    break
        collected.extend(reversed(ancestors))
        collected.append(all_lines[hit])

        # The section ends at the next heading that is not deeper than the hit.
        for position in range(hit + 1, len(all_lines)):
            if 0 < levels[position] <= hit_depth:
                break
            collected.append(all_lines[position])
    return "\n".join(collected)
def search_markdown(
    content: str, pattern: str, context_lines: int = 0
) -> Optional[str]:
    """Search a Markdown document line by line with a regular expression.

    Blank lines are ignored throughout: matching, context counting and the
    returned text all operate on the non-empty lines only.

    Args:
        content: The Markdown text to search.
        pattern: Regular expression applied to each line with ``re.search``.
        context_lines: Number of non-empty lines to include before and after
            each match. Negative values are treated as 0.

    Returns:
        The matched lines with their context. Matches whose distance (in
        non-empty lines) is at most ``2 * context_lines`` are merged into one
        result; separate results are joined by a ``---`` line. ``None`` is
        returned when the pattern is invalid or nothing matches.
    """
    try:
        regex = re.compile(pattern)
    except re.error:
        return None

    # A negative context would push the start index past the end of the list.
    context_lines = max(0, context_lines)

    non_empty_lines = [line for line in content.split("\n") if line.strip()]
    matched = [
        position
        for position, line in enumerate(non_empty_lines)
        if regex.search(line)
    ]
    if not matched:
        return None

    merged_ranges = []
    range_start = range_end = matched[0]
    for position in matched[1:]:
        if position - range_end <= context_lines * 2:
            range_end = position
        else:
            merged_ranges.append((range_start, range_end))
            range_start = range_end = position
    merged_ranges.append((range_start, range_end))

    last_position = len(non_empty_lines) - 1
    results = []
    for start, end in merged_ranges:
        first = max(0, start - context_lines)
        final = min(last_position, end + context_lines)
        results.append("\n".join(non_empty_lines[first : final + 1]))
    return "\n---\n".join(results)
def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown with the MarkItDown library.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)`` where
        the message says the document is empty, the library could not be
        imported, or conversion raised.
    """
    try:
        from markitdown import MarkItDown

        converted = MarkItDown().convert(file_path)
        if converted.text_content.strip():
            return converted.text_content, None
        return None, "文档为空"
    except ImportError:
        return None, "MarkItDown 库未安装"
    except Exception as exc:
        return None, f"MarkItDown 解析失败: {exc!s}"
def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown with python-docx.

    Paragraphs are emitted first, in document order: headings become ``#``
    lines (style names containing "Title" or "Heading 1" map to level 1,
    "Heading 2".."Heading 6" to their level), "List Bullet*"/"Bullet" styles
    become ``- `` items, "List Number*"/"Number" styles become ``1. `` items,
    and everything else is plain text followed by a blank line. Bold, italic
    and underlined runs are wrapped in ``**``, ``*`` and ``<u>``. All tables
    are appended after the paragraphs.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)``.
    """
    try:
        from docx import Document
    except ImportError:
        return None, "python-docx 库未安装"

    def heading_depth(paragraph: Any) -> int:
        style = paragraph.style
        if not (style and style.name):
            return 0
        label = style.name
        if "Heading 1" in label or "Title" in label:
            return 1
        for depth in range(2, 7):
            if f"Heading {depth}" in label:
                return depth
        return 0

    def list_kind(paragraph: Any) -> Optional[str]:
        if not paragraph.style or not paragraph.style.name:
            return None
        label = paragraph.style.name
        if label.startswith("List Bullet") or label == "Bullet":
            return "bullet"
        if label.startswith("List Number") or label == "Number":
            return "number"
        return None

    def render_runs(runs: List[Any]) -> str:
        pieces = []
        for run in runs:
            piece = run.text
            if not piece:
                continue
            # Wrappers nest in this order: bold innermost, underline outermost.
            for enabled, template in (
                (run.bold, "**{}**"),
                (run.italic, "*{}*"),
                (run.underline, "<u>{}</u>"),
            ):
                if enabled:
                    piece = template.format(piece)
            pieces.append(piece)
        return "".join(pieces)

    def table_markdown(table: Any) -> str:
        grid = [
            [cell.text.strip().replace("\n", " ") for cell in row.cells]
            for row in table.rows
        ]
        return build_markdown_table(grid)

    try:
        document = Document(file_path)
        output = []
        in_list = False
        for paragraph in document.paragraphs:
            text = render_runs(paragraph.runs)
            if not text.strip():
                continue
            depth = heading_depth(paragraph)
            if depth > 0:
                output.append(f"{'#' * depth} {text}")
                in_list = False
                continue
            kind = list_kind(paragraph)
            if kind in ("bullet", "number"):
                # Separate the first item of a list from preceding content.
                if not in_list and output:
                    output.append("")
                output.append(("- " if kind == "bullet" else "1. ") + text)
                in_list = True
            else:
                # Close a preceding list with a blank line.
                if in_list and output:
                    output.append("")
                output.append(text)
                output.append("")
                in_list = False
        for table in document.tables:
            output.append(table_markdown(table))
            output.append("")
        content = "\n".join(output)
        if content.strip():
            return content, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"python-docx 解析失败: {exc!s}"
def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown by reading its XML parts directly.

    ``word/styles.xml`` is read (best effort) to map style ids to heading
    levels and list kinds; ``word/document.xml`` is then walked in body
    order, emitting headings, list items, plain paragraphs and tables.

    Bold and italic are WordprocessingML toggle properties: the element may
    be present with ``w:val`` set to ``0``/``false``/``off`` to switch the
    formatting off, so the value is checked rather than mere presence. All
    ``w:t`` children of a run are concatenated.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)``.
    """
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}
    val_attr = f"{{{word_namespace}}}val"
    heading_levels = {
        "Title": 1,
        "heading 1": 1,
        "heading 2": 2,
        "heading 3": 3,
        "heading 4": 4,
        "heading 5": 5,
        "heading 6": 6,
    }

    def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
        return style_to_level.get(style_id, 0)

    def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]:
        return style_to_list.get(style_id, None)

    def is_toggle_enabled(run: Any, tag: str) -> bool:
        toggle = run.find(f"w:rPr/w:{tag}", namespaces=namespaces)
        if toggle is None:
            return False
        # A missing w:val means "on"; explicit off values disable the property.
        return toggle.get(val_attr, "true").lower() not in ("0", "false", "off")

    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
        texts = []
        for run in para.findall(".//w:r", namespaces=namespaces):
            text = "".join(
                text_elem.text or ""
                for text_elem in run.findall("w:t", namespaces=namespaces)
            )
            if not text:
                continue
            if is_toggle_enabled(run, "b"):
                text = f"**{text}**"
            if is_toggle_enabled(run, "i"):
                text = f"*{text}*"
            texts.append(text)
        return "".join(texts).strip()

    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
        if not rows:
            return ""
        md_lines = []
        for i, row in enumerate(rows):
            cell_texts = [
                extract_text_with_formatting(cell, namespaces)
                .replace("\n", " ")
                .strip()
                for cell in row.findall(".//w:tc", namespaces=namespaces)
            ]
            if cell_texts:
                md_lines.append("| " + " | ".join(cell_texts) + " |")
                if i == 0:
                    md_lines.append(
                        "| " + " | ".join(["---"] * len(cell_texts)) + " |"
                    )
        return "\n".join(md_lines)

    try:
        style_to_level = {}
        style_to_list = {}
        markdown_lines = []
        with zipfile.ZipFile(file_path) as zip_file:
            try:
                styles_file = safe_open_zip(zip_file, "word/styles.xml")
                if styles_file:
                    with styles_file:
                        styles_root = ET.parse(styles_file).getroot()
                    for style in styles_root.findall(
                        ".//w:style", namespaces=namespaces
                    ):
                        style_id = style.get(f"{{{word_namespace}}}styleId")
                        style_name_elem = style.find("w:name", namespaces=namespaces)
                        if not style_id or style_name_elem is None:
                            continue
                        style_name = style_name_elem.get(val_attr)
                        if not style_name:
                            continue
                        if style_name in heading_levels:
                            style_to_level[style_id] = heading_levels[style_name]
                        elif (
                            style_name.startswith("List Bullet")
                            or style_name == "Bullet"
                        ):
                            style_to_list[style_id] = "bullet"
                        elif (
                            style_name.startswith("List Number")
                            or style_name == "Number"
                        ):
                            style_to_list[style_id] = "number"
            except Exception:
                # Styles are optional: without them every paragraph is simply
                # emitted as plain text, so any failure here is ignored.
                pass

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"
            with document_file:
                root = ET.parse(document_file).getroot()
            body = root.find(".//w:body", namespaces=namespaces)
            if body is None:
                return None, "document.xml 中未找到 w:body 元素"

            for child in body.findall("./*", namespaces=namespaces):
                if child.tag.endswith("}p"):
                    style_elem = child.find(".//w:pStyle", namespaces=namespaces)
                    style_id = (
                        style_elem.get(val_attr) if style_elem is not None else None
                    )
                    heading_level = get_heading_level(style_id, style_to_level)
                    list_style = get_list_style(style_id, style_to_list)
                    para_text = extract_text_with_formatting(child, namespaces)
                    if para_text:
                        if heading_level > 0:
                            markdown_lines.append(f"{'#' * heading_level} {para_text}")
                        elif list_style == "bullet":
                            markdown_lines.append(f"- {para_text}")
                        elif list_style == "number":
                            markdown_lines.append(f"1. {para_text}")
                        else:
                            markdown_lines.append(para_text)
                        markdown_lines.append("")
                elif child.tag.endswith("}tbl"):
                    table_md = convert_table_to_markdown(child, namespaces)
                    if table_md:
                        markdown_lines.append(table_md)
                        markdown_lines.append("")

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"
def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a PPTX file to Markdown with MarkItDown, then filter the result.

    The converted text is passed through ``filter_markdown_content``.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)`` where
        the message says the document is empty (before or after filtering),
        the library could not be imported, or conversion raised.
    """
    try:
        from markitdown import MarkItDown

        raw_text = MarkItDown().convert(file_path).text_content
        if not raw_text.strip():
            return None, "文档为空"
        filtered_content = filter_markdown_content(raw_text)
        if filtered_content.strip():
            return filtered_content, None
        return None, "过滤后文档为空"
    except ImportError:
        return None, "MarkItDown 库未安装"
    except Exception as exc:
        return None, f"MarkItDown 解析失败: {exc!s}"
def extract_formatted_text_pptx(runs: List[Any]) -> str:
    """Join text runs into one Markdown string with bold/italic markers.

    Each run must expose ``text`` and ``font``; ``font.bold`` and
    ``font.italic`` are read with ``getattr`` and treated as off when missing
    or falsy. Runs with empty text are skipped and the joined result is
    stripped.
    """
    fences = {
        (True, True): "***",
        (True, False): "**",
        (False, True): "*",
        (False, False): "",
    }
    pieces = []
    for run in runs:
        if not run.text:
            continue
        fragment = run.text
        font = run.font
        bold = bool(getattr(font, "bold", False))
        italic = bool(getattr(font, "italic", False))
        fence = fences[(bold, italic)]
        pieces.append(f"{fence}{fragment}{fence}")
    return "".join(pieces).strip()
def convert_table_to_md_pptx(table: Any) -> str:
    """Render a table object (``rows`` -> ``cells`` -> ``text_frame``) as Markdown.

    Every cell becomes the space-joined formatted text of its non-empty
    paragraphs; the resulting grid is handed to ``build_markdown_table``.
    """

    def cell_markdown(cell: Any) -> str:
        fragments = []
        for paragraph in cell.text_frame.paragraphs:
            fragment = extract_formatted_text_pptx(paragraph.runs)
            if fragment:
                fragments.append(fragment)
        return " ".join(fragments).strip()

    grid = [[cell_markdown(cell) for cell in row.cells] for row in table.rows]
    return build_markdown_table(grid)
def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a PPTX file to Markdown with python-pptx.

    Each slide starts with a ``## Slide N`` heading and ends with a ``---``
    line. Picture shapes are skipped, tables are rendered with
    ``convert_table_to_md_pptx``, and text-frame paragraphs become either
    list items (``- `` / ``1. ``, prefixed by one space per level) or plain
    text lines.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)``.
    """
    try:
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE
    except ImportError:
        return None, "python-pptx 库未安装"
    try:
        prs = Presentation(file_path)
        md_content = []
        for slide_num, slide in enumerate(prs.slides, 1):
            md_content.append(f"\n## Slide {slide_num}\n")
            # One slot per list level. A slot is filled and emitted within the
            # same paragraph, so between paragraphs the slots are always "".
            list_stack = []
            for shape in slide.shapes:
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    continue
                if hasattr(shape, "has_table") and shape.has_table:
                    # Leaving a list: emits the separator built from the
                    # (empty) slots, then resets the stack.
                    if list_stack:
                        md_content.append(
                            "\n" + "\n".join([x for x in list_stack if x]) + "\n"
                        )
                        list_stack.clear()
                    table_md = convert_table_to_md_pptx(shape.table)
                    md_content.append(table_md)
                if hasattr(shape, "text_frame"):
                    for para in shape.text_frame.paragraphs:
                        pPr = para._element.pPr
                        is_list = False
                        # A paragraph counts as a list item when it is
                        # indented or carries a bullet/auto-number element.
                        if pPr is not None:
                            is_list = (
                                para.level > 0
                                or pPr.find(
                                    ".//a:buChar",
                                    namespaces={
                                        "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
                                    },
                                )
                                is not None
                                or pPr.find(
                                    ".//a:buAutoNum",
                                    namespaces={
                                        "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
                                    },
                                )
                                is not None
                            )
                        if is_list:
                            level = para.level
                            # Grow the stack so that index `level` exists.
                            while len(list_stack) <= level:
                                list_stack.append("")
                            text = extract_formatted_text_pptx(para.runs)
                            if text:
                                pPr = para._element.pPr
                                # Auto-numbered paragraphs become ordered items.
                                is_ordered = (
                                    pPr is not None
                                    and pPr.find(
                                        ".//a:buAutoNum",
                                        namespaces={
                                            "a": "http://schemas.openxmlformats.org/drawingml/2006/main"
                                        },
                                    )
                                    is not None
                                )
                                marker = "1. " if is_ordered else "- "
                                indent = " " * level
                                list_stack[level] = f"{indent}{marker}{text}"
                                # Emit the filled slot immediately and clear it.
                                for i in range(len(list_stack)):
                                    if list_stack[i]:
                                        md_content.append(list_stack[i] + "\n")
                                        list_stack[i] = ""
                        else:
                            # First non-list paragraph after a list closes it.
                            if list_stack:
                                md_content.append(
                                    "\n"
                                    + "\n".join([x for x in list_stack if x])
                                    + "\n"
                                )
                                list_stack.clear()
                            text = extract_formatted_text_pptx(para.runs)
                            if text:
                                md_content.append(f"{text}\n")
            # Close a list that runs to the end of the slide.
            if list_stack:
                md_content.append("\n" + "\n".join([x for x in list_stack if x]) + "\n")
                list_stack.clear()
            md_content.append("---\n")
        content = "\n".join(md_content)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"python-pptx 解析失败: {str(e)}"
def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a PPTX file to Markdown by reading the slide XML directly.

    Slides are the ``ppt/slides/slideN.xml`` members, processed in ascending
    numeric order of ``N`` (archive order is not reliable, and a plain string
    sort would put ``slide10`` before ``slide2``). For every slide a
    ``## Slide N`` heading is written, followed by all tables, then the
    paragraphs of every ``p:sp/p:txBody`` (list items as ``- `` / ``1. ``
    prefixed by one space per level, other paragraphs as plain lines), and a
    closing ``---`` line.

    Bold and italic are read from the ``b`` and ``i`` attributes of
    ``a:rPr``, which is where DrawingML stores them.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)``.
    """
    pptx_namespace = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }
    true_values = ("1", "true")

    def extract_text_with_formatting_xml(text_elem: Any, namespaces: dict) -> str:
        result = []
        for run in text_elem.findall(".//a:r", namespaces=namespaces):
            t_elem = run.find(".//a:t", namespaces=namespaces)
            if t_elem is None or not t_elem.text:
                continue
            text = t_elem.text
            rPr = run.find(".//a:rPr", namespaces=namespaces)
            is_bold = rPr is not None and rPr.get("b") in true_values
            is_italic = rPr is not None and rPr.get("i") in true_values
            if is_bold and is_italic:
                text = f"***{text}***"
            elif is_bold:
                text = f"**{text}**"
            elif is_italic:
                text = f"*{text}*"
            result.append(text)
        return "".join(result).strip() if result else ""

    def convert_table_to_md_xml(table_elem: Any, namespaces: dict) -> str:
        rows = table_elem.findall(".//a:tr", namespaces=namespaces)
        if not rows:
            return ""
        rows_data = []
        for row in rows:
            row_data = []
            for cell in row.findall(".//a:tc", namespaces=namespaces):
                cell_text = extract_text_with_formatting_xml(cell, namespaces)
                if cell_text:
                    cell_text = cell_text.replace("\n", " ").replace("\r", " ")
                row_data.append(cell_text if cell_text else "")
            rows_data.append(row_data)
        return build_markdown_table(rows_data)

    def is_list_item_xml(p_elem: Any, namespaces: dict) -> Tuple[bool, bool]:
        # Returns (is_list, is_ordered).
        if p_elem is None:
            return False, False
        pPr = p_elem.find(".//a:pPr", namespaces=namespaces)
        if pPr is None:
            return False, False
        if pPr.find(".//a:buChar", namespaces=namespaces) is not None:
            return True, False
        if pPr.find(".//a:buAutoNum", namespaces=namespaces) is not None:
            return True, True
        return False, False

    def get_indent_level_xml(p_elem: Any, namespaces: dict) -> int:
        if p_elem is None:
            return 0
        pPr = p_elem.find(".//a:pPr", namespaces=namespaces)
        if pPr is None:
            return 0
        lvl = pPr.get("lvl")
        return int(lvl) if lvl else 0

    def slide_number(member_name: str) -> int:
        return int(re.search(r"(\d+)\.xml$", member_name).group(1))

    try:
        md_content = []
        with zipfile.ZipFile(file_path) as zip_file:
            slide_files = sorted(
                (
                    member
                    for member in zip_file.namelist()
                    if re.match(r"ppt/slides/slide\d+\.xml$", member)
                ),
                key=slide_number,
            )
            for slide_idx, slide_file in enumerate(slide_files, 1):
                md_content.append("\n## Slide {}\n".format(slide_idx))
                with zip_file.open(slide_file) as slide_xml:
                    slide_root = ET.parse(slide_xml).getroot()
                tx_bodies = slide_root.findall(
                    ".//p:sp/p:txBody", namespaces=pptx_namespace
                )
                for table in slide_root.findall(".//a:tbl", namespaces=pptx_namespace):
                    table_md = convert_table_to_md_xml(table, pptx_namespace)
                    if table_md:
                        md_content.append(table_md)
                for tx_body in tx_bodies:
                    for para in tx_body.findall(".//a:p", namespaces=pptx_namespace):
                        is_list, is_ordered = is_list_item_xml(para, pptx_namespace)
                        if is_list:
                            level = get_indent_level_xml(para, pptx_namespace)
                            text = extract_text_with_formatting_xml(
                                para, pptx_namespace
                            )
                            if text:
                                marker = "1. " if is_ordered else "- "
                                indent = " " * level
                                md_content.append(f"{indent}{marker}{text}\n")
                        else:
                            text = extract_text_with_formatting_xml(
                                para, pptx_namespace
                            )
                            if text:
                                md_content.append(f"{text}\n")
                md_content.append("---\n")
        content = "\n".join(md_content)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"
def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert an XLSX file to Markdown with the MarkItDown library.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)`` where
        the message says the document is empty, the library could not be
        imported, or conversion raised.
    """
    try:
        from markitdown import MarkItDown

        conversion = MarkItDown().convert(file_path)
        is_blank = not conversion.text_content.strip()
        if is_blank:
            return None, "文档为空"
        return conversion.text_content, None
    except ImportError:
        return None, "MarkItDown 库未安装"
    except Exception as exc:
        return None, f"MarkItDown 解析失败: {exc!s}"
def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert an XLSX file to a Markdown pipe table with pandas and tabulate.

    ``pd.read_excel`` is called with its default arguments and the resulting
    frame is rendered with the row index shown and missing values left
    empty, under a fixed title and a source line.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)`` naming
        the missing library, reporting an empty sheet, or carrying the
        pandas error text.
    """
    try:
        import pandas as pd
        from tabulate import tabulate
    except ImportError as import_error:
        # The exception text is inspected to tell the two imports apart.
        missing_lib = "tabulate"
        if "pandas" in str(import_error):
            missing_lib = "pandas"
        return None, f"{missing_lib} 库未安装"

    try:
        frame = pd.read_excel(file_path)
        if len(frame) == 0:
            return None, "Excel 文件为空"
        table_text = tabulate(
            frame, headers="keys", tablefmt="pipe", showindex=True, missingval=""
        )
        document = "# Excel数据转换结果\n\n来源: {}\n\n{}".format(file_path, table_text)
        return document, None
    except Exception as exc:
        return None, f"pandas 解析失败: {exc!s}"
def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert an XLSX file to Markdown by reading its XML parts directly.

    Sheet names come from ``xl/workbook.xml``. Each sheet's worksheet part is
    located through ``xl/_rels/workbook.xml.rels`` (via the sheet's ``r:id``);
    when that lookup is not possible the positional name
    ``xl/worksheets/sheet<N>.xml`` is used instead. Shared strings and inline
    strings are assembled from all of their text runs (phonetic runs are not
    included). Columns that are empty in every row are omitted from the
    rendered table, and the first row is used as the header.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, message)``.
    """
    xlsx_namespace = {
        "main": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
    }
    doc_rel_namespace = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    package_rel_namespace = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    error_codes = {
        "#NULL!": "空引用错误",
        "#DIV/0!": "除零错误",
        "#VALUE!": "值类型错误",
        "#REF!": "无效引用",
        "#NAME?": "名称错误",
        "#NUM!": "数值错误",
        "#N/A": "值不可用",
    }

    def single_line(text: str) -> str:
        return text.replace("\n", " ").replace("\r", "")

    def string_item_text(item: ET.Element) -> str:
        # Plain strings hold one <t>; rich text holds one <t> per <r> run.
        parts = [t_elem.text or "" for t_elem in item.findall("main:t", xlsx_namespace)]
        parts.extend(
            t_elem.text or ""
            for t_elem in item.findall("main:r/main:t", xlsx_namespace)
        )
        return "".join(parts)

    def parse_col_index(cell_ref: str) -> int:
        # "A" -> 0, "Z" -> 25, "AA" -> 26; stops at the first non-letter.
        col_index = 0
        for char in cell_ref:
            if not char.isalpha():
                break
            col_index = col_index * 26 + (ord(char) - ord("A") + 1)
        return col_index - 1

    def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str:
        cell_type = cell.attrib.get("t")
        # Inline strings live in <is>, not <v>, so handle them before the
        # <v> presence check.
        if cell_type == "inlineStr":
            is_elem = cell.find("main:is", xlsx_namespace)
            if is_elem is None:
                return ""
            return single_line(string_item_text(is_elem))

        cell_value_elem = cell.find("main:v", xlsx_namespace)
        if cell_value_elem is None or not cell_value_elem.text:
            return ""
        cell_value = cell_value_elem.text

        if cell_type == "s":
            try:
                idx = int(cell_value)
            except ValueError:
                return ""
            if 0 <= idx < len(shared_strings):
                return single_line(shared_strings[idx])
            return ""
        if cell_type == "b":
            return "TRUE" if cell_value == "1" else "FALSE"
        if cell_type == "str":
            return single_line(cell_value)
        if cell_type == "e":
            return error_codes.get(cell_value, f"错误: {cell_value}")
        if cell_type == "d":
            return f"[日期] {cell_value}"
        if cell_type is None:
            # Untyped cells are numeric; render whole numbers without ".0".
            try:
                float_val = float(cell_value)
            except ValueError:
                return cell_value
            if float_val.is_integer():
                return str(int(float_val))
            return cell_value
        return cell_value

    def get_non_empty_columns(data: List[List[str]]) -> set:
        non_empty_cols = set()
        for row in data:
            for col_idx, cell in enumerate(row):
                if cell and cell.strip():
                    non_empty_cols.add(col_idx)
        return non_empty_cols

    def filter_columns(row: List[str], non_empty_cols: set) -> List[str]:
        return [row[i] if i < len(row) else "" for i in sorted(non_empty_cols)]

    def data_to_markdown(data: List[List[str]], sheet_name: str) -> str:
        if not data or not data[0]:
            return f"## {sheet_name}\n\n*工作表为空*"
        non_empty_cols = get_non_empty_columns(data)
        if not non_empty_cols:
            return f"## {sheet_name}\n\n*工作表为空*"
        filtered_headers = filter_columns(data[0], non_empty_cols)
        md_lines = [
            f"## {sheet_name}",
            "",
            "| " + " | ".join(filtered_headers) + " |",
            "|" + "|".join(["---"] * len(filtered_headers)) + "|",
        ]
        for row in data[1:]:
            md_lines.append(
                "| " + " | ".join(filter_columns(row, non_empty_cols)) + " |"
            )
        md_lines.append("")
        return "\n".join(md_lines)

    def read_sheet_rows(sheet_root: ET.Element, shared_strings: List[str]) -> List[List[str]]:
        rows = []
        sheet_data = sheet_root.find("main:sheetData", xlsx_namespace)
        if sheet_data is None:
            return rows
        for row_elem in sheet_data.findall("main:row", xlsx_namespace):
            col_dict = {}
            for cell in row_elem.findall("main:c", xlsx_namespace):
                cell_ref = cell.attrib.get("r", "")
                if not cell_ref:
                    continue
                col_dict[parse_col_index(cell_ref)] = parse_cell_value(
                    cell, shared_strings
                )
            if col_dict:
                max_col = max(col_dict.keys())
                rows.append([col_dict.get(i, "") for i in range(max_col + 1)])
        return rows

    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            member_names = set(zip_file.namelist())

            # (sheet name, relationship id or None) for every named sheet.
            sheets = []
            try:
                with zip_file.open("xl/workbook.xml") as f:
                    root = ET.parse(f).getroot()
            except KeyError:
                return None, "无法解析工作表名称"
            for sheet in root.findall(".//main:sheet", xlsx_namespace):
                sheet_name = sheet.attrib.get("name", "")
                if sheet_name:
                    sheets.append(
                        (sheet_name, sheet.attrib.get(f"{{{doc_rel_namespace}}}id"))
                    )
            if not sheets:
                return None, "未找到工作表"

            # Relationship id -> archive member path of the target part.
            rel_targets = {}
            try:
                with zip_file.open("xl/_rels/workbook.xml.rels") as f:
                    rels_root = ET.parse(f).getroot()
                for rel in rels_root.findall(f"{{{package_rel_namespace}}}Relationship"):
                    rel_id = rel.attrib.get("Id")
                    target = rel.attrib.get("Target")
                    if not rel_id or not target:
                        continue
                    if target.startswith("/"):
                        rel_targets[rel_id] = target.lstrip("/")
                    else:
                        rel_targets[rel_id] = "xl/" + target
            except (KeyError, ET.ParseError):
                # No usable relationships part: positional names are used.
                pass

            shared_strings = []
            try:
                with zip_file.open("xl/sharedStrings.xml") as f:
                    root = ET.parse(f).getroot()
                for si in root.findall(".//main:si", xlsx_namespace):
                    shared_strings.append(string_item_text(si))
            except KeyError:
                # Workbooks without strings have no shared-strings part.
                pass

            parts = [
                "# Excel数据转换结果 (原生XML解析)\n\n",
                f"来源: {file_path}\n\n",
            ]
            for sheet_index, (sheet_name, rel_id) in enumerate(sheets, start=1):
                worksheet_path = rel_targets.get(rel_id)
                if worksheet_path not in member_names:
                    worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml"
                try:
                    with zip_file.open(worksheet_path) as f:
                        sheet_root = ET.parse(f).getroot()
                except KeyError:
                    parts.append(f"## {sheet_name}\n\n*工作表解析失败*\n\n")
                    continue
                rows = read_sheet_rows(sheet_root, shared_strings)
                parts.append(data_to_markdown(rows, sheet_name) + "\n\n")
            return "".join(parts), None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"
def detect_file_type(file_path: str) -> Optional[str]:
    """Detect the document type from the extension and the archive contents.

    Returns:
        ``"docx"``, ``"pptx"`` or ``"xlsx"`` when the (case-insensitive)
        extension matches and the matching ``is_valid_*`` check passes;
        ``None`` otherwise.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension == ".docx":
        return "docx" if is_valid_docx(file_path) else None
    if extension == ".pptx":
        return "pptx" if is_valid_pptx(file_path) else None
    if extension == ".xlsx":
        return "xlsx" if is_valid_xlsx(file_path) else None
    return None
def main() -> None:
    """Command-line entry point: parse a document and print the requested view.

    The file is validated, then the parsers for its type are tried in
    priority order until one returns content. Images are removed from that
    content and blank-line runs are collapsed. Depending on the mutually
    exclusive option given, the character count (newlines excluded), line
    count, heading list, a named section, regex search results, or the full
    Markdown is printed. Every failure prints a message and exits with
    status 1.
    """
    parser = argparse.ArgumentParser(
        description="将 DOCX、PPTX 或 XLSX 文件解析为 Markdown"
    )
    parser.add_argument("file_path", help="DOCX、PPTX 或 XLSX 文件的绝对路径")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行1-6级",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    args = parser.parse_args()

    def fail(message: str) -> None:
        print(message)
        sys.exit(1)

    if not os.path.exists(args.file_path):
        fail(f"错误: 文件不存在: {args.file_path}")
    file_type = detect_file_type(args.file_path)
    if file_type is None:
        fail(f"错误: 文件不是有效的 DOCX、PPTX 或 XLSX 格式: {args.file_path}")

    # Parsers per type, highest priority first; XLSX is the remaining case.
    xlsx_parsers = [
        ("MarkItDown", parse_xlsx_with_markitdown),
        ("pandas", parse_xlsx_with_pandas),
        ("XML 原生解析", parse_xlsx_with_xml),
    ]
    parsers = {
        "docx": [
            ("MarkItDown", parse_docx_with_markitdown),
            ("python-docx", parse_docx_with_python_docx),
            ("XML 原生解析", parse_docx_with_xml),
        ],
        "pptx": [
            ("MarkItDown", parse_pptx_with_markitdown),
            ("python-pptx", parse_pptx_with_python_pptx),
            ("XML 原生解析", parse_pptx_with_xml),
        ],
    }.get(file_type, xlsx_parsers)

    content = None
    failures = []
    for parser_name, parser_func in parsers:
        parsed, error = parser_func(args.file_path)
        if parsed is None:
            failures.append(f"- {parser_name}: {error}")
            continue
        content = normalize_markdown_whitespace(remove_markdown_images(parsed))
        break

    if content is None:
        print("所有解析方法均失败:")
        for failure in failures:
            print(failure)
        sys.exit(1)

    if args.count:
        print(len(content.replace("\n", "")))
        return
    if args.lines:
        print(len(content.split("\n")))
        return
    if args.titles:
        for title in extract_titles(content):
            print(title)
        return
    if args.title_content:
        title_content = extract_title_content(content, args.title_content)
        if title_content is None:
            fail(f"错误: 未找到标题 '{args.title_content}'")
        print(title_content, end="")
        return
    if args.search:
        search_result = search_markdown(content, args.search, args.context)
        if search_result is None:
            fail(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
        print(search_result, end="")
        return
    print(content, end="")
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()