1
0
Files
Skill/skills/lyxy-reader-docx/scripts/docx_parser.py
2026-02-12 17:57:05 +08:00

552 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""整合的 DOCX 解析器,按优先级尝试多种解析方法:
1. MarkItDown (微软官方库)
2. python-docx (成熟的 Python 库)
3. XML 原生解析 (备选方案)
代码风格要求:
- Python 3.6+ 兼容
- 遵循 PEP 8 规范
- 所有公共 API 函数添加类型提示
- 字符串优先内联使用不提取为常量除非被使用超过3次
- 其他被多次使用的对象根据具体情况可考虑被提取为常量(如正则表达式)
- 模块级和公共 API 函数保留文档字符串
- 内部辅助函数不添加文档字符串(函数名足够描述)
- 变量命名清晰,避免单字母变量名
"""
import argparse
import os
import re
import sys
import zipfile
import xml.etree.ElementTree as ET
from typing import List, Optional, Tuple
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
def normalize_markdown_whitespace(content: str) -> str:
lines = content.split("\n")
result = []
empty_count = 0
for line in lines:
stripped = line.strip()
if not stripped:
empty_count += 1
if empty_count == 1:
result.append(line)
else:
empty_count = 0
result.append(line)
return "\n".join(result)
def is_valid_docx(file_path: str) -> bool:
try:
with zipfile.ZipFile(file_path, "r") as zip_file:
required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
for required in required_files:
if required not in zip_file.namelist():
return False
return True
except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False
def remove_markdown_images(markdown_text: str) -> str:
return IMAGE_PATTERN.sub("", markdown_text)
def extract_titles(markdown_text: str) -> List[str]:
"""提取 markdown 文本中的所有标题行1-6级"""
title_lines = []
for line in markdown_text.split("\n"):
stripped = line.lstrip()
if stripped.startswith("#"):
level = 0
for char in stripped:
if char == "#":
level += 1
else:
break
if 1 <= level <= 6:
title_lines.append(stripped)
return title_lines
def get_heading_level(line: str) -> int:
stripped = line.lstrip()
if not stripped.startswith("#"):
return 0
level = 0
for char in stripped:
if char == "#":
level += 1
else:
break
return level if 1 <= level <= 6 else 0
def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
"""提取所有指定标题及其下级内容(每个包含上级标题)"""
lines = markdown_text.split("\n")
match_indices = []
for i, line in enumerate(lines):
level = get_heading_level(line)
if level > 0:
stripped = line.lstrip()
title_text = stripped[level:].strip()
if title_text == title_name:
match_indices.append(i)
if not match_indices:
return None
result_lines = []
for idx in match_indices:
target_level = get_heading_level(lines[idx])
parent_titles = []
current_level = target_level
for i in range(idx - 1, -1, -1):
line_level = get_heading_level(lines[i])
if line_level > 0 and line_level < current_level:
parent_titles.append(lines[i])
current_level = line_level
if current_level == 1:
break
parent_titles.reverse()
result_lines.extend(parent_titles)
result_lines.append(lines[idx])
for i in range(idx + 1, len(lines)):
line = lines[i]
line_level = get_heading_level(line)
if line_level == 0 or line_level > target_level:
result_lines.append(line)
else:
break
return "\n".join(result_lines)
def search_markdown(
content: str, pattern: str, context_lines: int = 0
) -> Optional[str]:
"""使用正则表达式搜索 markdown 文档,返回匹配结果及其上下文"""
try:
regex = re.compile(pattern)
except re.error:
return None
lines = content.split("\n")
non_empty_indices = []
non_empty_to_original = {}
for i, line in enumerate(lines):
if line.strip():
non_empty_indices.append(i)
non_empty_to_original[i] = len(non_empty_indices) - 1
matched_non_empty_indices = []
for orig_idx in non_empty_indices:
if regex.search(lines[orig_idx]):
matched_non_empty_indices.append(non_empty_to_original[orig_idx])
if not matched_non_empty_indices:
return None
merged_ranges = []
current_start = matched_non_empty_indices[0]
current_end = matched_non_empty_indices[0]
for idx in matched_non_empty_indices[1:]:
if idx - current_end <= context_lines * 2:
current_end = idx
else:
merged_ranges.append((current_start, current_end))
current_start = idx
current_end = idx
merged_ranges.append((current_start, current_end))
results = []
for start, end in merged_ranges:
actual_start = max(0, start - context_lines)
actual_end = min(len(non_empty_indices) - 1, end + context_lines)
start_line_idx = non_empty_indices[actual_start]
end_line_idx = non_empty_indices[actual_end]
selected_indices = set(non_empty_indices[actual_start : actual_end + 1])
result_lines = [
line
for i, line in enumerate(lines)
if start_line_idx <= i <= end_line_idx
and (line.strip() or i in selected_indices)
]
results.append("\n".join(result_lines))
return "\n---\n".join(results)
def parse_with_markitdown(file_path: str) -> Optional[Tuple[str, None]]:
try:
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert(file_path)
if not result.text_content.strip():
return None, "文档为空"
return result.text_content, None
except ImportError:
return None, "MarkItDown 库未安装"
except Exception as e:
return None, f"MarkItDown 解析失败: {str(e)}"
def parse_with_python_docx(file_path: str) -> Optional[Tuple[str, None]]:
try:
from docx import Document
except ImportError:
return None, "python-docx 库未安装"
try:
doc = Document(file_path)
def get_heading_level(para) -> int:
if para.style and para.style.name:
style_name = para.style.name
if "Heading 1" in style_name or "Title" in style_name:
return 1
elif "Heading 2" in style_name:
return 2
elif "Heading 3" in style_name:
return 3
elif "Heading 4" in style_name:
return 4
elif "Heading 5" in style_name:
return 5
elif "Heading 6" in style_name:
return 6
return 0
def get_list_style(para) -> Optional[str]:
if not para.style or not para.style.name:
return None
style_name = para.style.name
if "List Bullet" in style_name or "Bullet" in style_name:
return "bullet"
elif "List Number" in style_name or "Number" in style_name:
return "number"
return None
def convert_runs_to_markdown(runs) -> str:
result = []
for run in runs:
text = run.text
if not text:
continue
if run.bold:
text = f"**{text}**"
if run.italic:
text = f"*{text}*"
if run.underline:
text = f"<u>{text}</u>"
result.append(text)
return "".join(result)
def convert_table_to_markdown(table) -> str:
md_lines = []
for i, row in enumerate(table.rows):
cells = []
for cell in row.cells:
cell_text = cell.text.strip().replace("\n", " ")
cells.append(cell_text)
if cells:
md_line = "| " + " | ".join(cells) + " |"
md_lines.append(md_line)
if i == 0:
sep_line = "| " + " | ".join(["---"] * len(cells)) + " |"
md_lines.append(sep_line)
return "\n".join(md_lines)
markdown_lines = []
for para in doc.paragraphs:
text = convert_runs_to_markdown(para.runs)
if not text.strip():
continue
heading_level = get_heading_level(para)
if heading_level > 0:
markdown_lines.append(f"{'#' * heading_level} {text}")
else:
list_style = get_list_style(para)
if list_style == "bullet":
markdown_lines.append(f"- {text}")
elif list_style == "number":
markdown_lines.append(f"1. {text}")
else:
markdown_lines.append(text)
markdown_lines.append("")
for table in doc.tables:
table_md = convert_table_to_markdown(table)
markdown_lines.append(table_md)
markdown_lines.append("")
content = "\n".join(markdown_lines)
if not content.strip():
return None, "文档为空"
return content, None
except Exception as e:
return None, f"python-docx 解析失败: {str(e)}"
def parse_with_xml(file_path: str) -> Optional[Tuple[str, None]]:
word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
namespaces = {"w": word_namespace}
def safe_open_zip(zip_file: zipfile.ZipFile, name: str):
if name.startswith("..") or "/" not in name:
return None
return zip_file.open(name)
def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
return style_to_level.get(style_id, 0)
def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]:
return style_to_list.get(style_id, None)
def extract_text_with_formatting(para, namespaces: dict) -> str:
texts = []
for run in para.findall(".//w:r", namespaces=namespaces):
text_elem = run.find(".//w:t", namespaces=namespaces)
if text_elem is not None and text_elem.text:
text = text_elem.text
bold = run.find(".//w:b", namespaces=namespaces) is not None
italic = run.find(".//w:i", namespaces=namespaces) is not None
if bold:
text = f"**{text}**"
if italic:
text = f"*{text}*"
texts.append(text)
return "".join(texts).strip()
def convert_table_to_markdown(table_elem, namespaces: dict) -> str:
rows = table_elem.findall(".//w:tr", namespaces=namespaces)
if not rows:
return ""
md_lines = []
for i, row in enumerate(rows):
cells = row.findall(".//w:tc", namespaces=namespaces)
cell_texts = []
for cell in cells:
cell_text = extract_text_with_formatting(cell, namespaces)
cell_text = cell_text.replace("\n", " ").strip()
cell_texts.append(cell_text if cell_text else "")
if cell_texts:
md_line = "| " + " | ".join(cell_texts) + " |"
md_lines.append(md_line)
if i == 0:
sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |"
md_lines.append(sep_line)
return "\n".join(md_lines)
try:
style_to_level = {}
style_to_list = {}
markdown_lines = []
with zipfile.ZipFile(file_path) as zip_file:
try:
styles_file = safe_open_zip(zip_file, "word/styles.xml")
if styles_file:
styles_root = ET.parse(styles_file)
for style in styles_root.findall(
".//w:style", namespaces=namespaces
):
style_id = style.get(f"{{{word_namespace}}}styleId")
style_name_elem = style.find("w:name", namespaces=namespaces)
if style_id and style_name_elem is not None:
style_name = style_name_elem.get(f"{{{word_namespace}}}val")
if style_name:
if style_name == "Title":
style_to_level[style_id] = 1
elif style_name == "heading 1":
style_to_level[style_id] = 1
elif style_name == "heading 2":
style_to_level[style_id] = 2
elif style_name == "heading 3":
style_to_level[style_id] = 3
elif style_name == "heading 4":
style_to_level[style_id] = 4
elif style_name == "heading 5":
style_to_level[style_id] = 5
elif style_name == "heading 6":
style_to_level[style_id] = 6
elif (
"List Bullet" in style_name
or "Bullet" in style_name
):
style_to_list[style_id] = "bullet"
elif (
"List Number" in style_name
or "Number" in style_name
):
style_to_list[style_id] = "number"
except Exception:
pass
document_file = safe_open_zip(zip_file, "word/document.xml")
if not document_file:
return None, "document.xml 不存在或无法访问"
root = ET.parse(document_file)
body = root.find(".//w:body", namespaces=namespaces)
if body is None:
return None, "document.xml 中未找到 w:body 元素"
for child in body.findall("./*", namespaces=namespaces):
if child.tag.endswith("}p"):
style_elem = child.find(".//w:pStyle", namespaces=namespaces)
style_id = (
style_elem.get(f"{{{word_namespace}}}val")
if style_elem is not None
else None
)
heading_level = get_heading_level(style_id, style_to_level)
list_style = get_list_style(style_id, style_to_list)
para_text = extract_text_with_formatting(child, namespaces)
if para_text:
if heading_level > 0:
markdown_lines.append(f"{'#' * heading_level} {para_text}")
elif list_style == "bullet":
markdown_lines.append(f"- {para_text}")
elif list_style == "number":
markdown_lines.append(f"1. {para_text}")
else:
markdown_lines.append(para_text)
markdown_lines.append("")
elif child.tag.endswith("}tbl"):
table_md = convert_table_to_markdown(child, namespaces)
if table_md:
markdown_lines.append(table_md)
markdown_lines.append("")
content = "\n".join(markdown_lines)
if not content.strip():
return None, "文档为空"
return content, None
except Exception as e:
return None, f"XML 解析失败: {str(e)}"
def main() -> None:
parser = argparse.ArgumentParser(description="将 DOCX 文件解析为 Markdown")
parser.add_argument("file_path", help="DOCX 文件的绝对路径")
parser.add_argument(
"-n",
"--context",
type=int,
default=2,
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
)
group.add_argument(
"-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
)
group.add_argument(
"-t",
"--titles",
action="store_true",
help="返回解析后的 markdown 文档的标题行1-6级",
)
group.add_argument(
"-tc",
"--title-content",
help="指定标题名称,输出该标题及其下级内容(不包含#号)",
)
group.add_argument(
"-s",
"--search",
help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
)
args = parser.parse_args()
if not os.path.exists(args.file_path):
print(f"错误: 文件不存在: {args.file_path}")
sys.exit(1)
if not args.file_path.lower().endswith(".docx"):
print(f"警告: 文件扩展名不是 .docx: {args.file_path}")
if not is_valid_docx(args.file_path):
print(f"错误: 文件不是有效的 DOCX 格式或已损坏: {args.file_path}")
sys.exit(1)
parsers = [
("MarkItDown", parse_with_markitdown),
("python-docx", parse_with_python_docx),
("XML 原生解析", parse_with_xml),
]
failures = []
content = None
for parser_name, parser_func in parsers:
content, error = parser_func(args.file_path)
if content is not None:
content = remove_markdown_images(content)
content = normalize_markdown_whitespace(content)
break
else:
failures.append(f"- {parser_name}: {error}")
if content is None:
print("所有解析方法均失败:")
for failure in failures:
print(failure)
sys.exit(1)
if args.count:
print(len(content.replace("\n", "")))
elif args.lines:
print(len(content.split("\n")))
elif args.titles:
titles = extract_titles(content)
for title in titles:
print(title)
elif args.title_content:
title_content = extract_title_content(content, args.title_content)
if title_content is None:
print(f"错误: 未找到标题 '{args.title_content}'")
sys.exit(1)
print(title_content, end="")
elif args.search:
search_result = search_markdown(content, args.search, args.context)
if search_result is None:
print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
sys.exit(1)
print(search_result, end="")
else:
print(content, end="")
if __name__ == "__main__":
main()