- 创建 scripts/ 目录作为核心代码根目录 - 移动 core/, readers/, utils/ 到 scripts/ 下 - 移动 config.py, lyxy_document_reader.py 到 scripts/ - 移动 encoding_detection.py 到 scripts/utils/ - 更新 pyproject.toml 中的入口点路径和 pytest 配置 - 更新所有内部导入语句为 scripts.* 模块 - 更新 README.md 目录结构说明 - 更新 openspec/config.yaml 添加目录结构说明 - 删除无用的 main.py 此变更使项目结构更清晰,便于区分核心代码与测试、文档等支撑文件。
136 lines
5.9 KiB
Python
136 lines
5.9 KiB
Python
"""使用 XML 原生解析 DOCX 文件"""
|
|
|
|
import xml.etree.ElementTree as ET
|
|
import zipfile
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from scripts.core import build_markdown_table, safe_open_zip
|
|
|
|
|
|
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown by reading its XML parts directly.

    The archive is opened with ``zipfile``. ``word/styles.xml`` is read on a
    best-effort basis to map style ids to heading levels and list kinds, then
    the direct children of ``w:body`` in ``word/document.xml`` are rendered in
    order: paragraphs become ``#`` headings, ``- `` / ``1. `` list items or
    plain text, and tables are handed to ``build_markdown_table``.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        ``(markdown, None)`` on success, or ``(None, message)`` when the
        document part cannot be opened, has no ``w:body``, produces no text,
        or any exception is raised while reading or parsing.
    """
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}

    val_attr = f"{{{word_namespace}}}val"
    text_tag = f"{{{word_namespace}}}t"
    # Run children that represent whitespace rather than literal text.
    whitespace_tags = {
        f"{{{word_namespace}}}{name}" for name in ("tab", "br", "cr")
    }
    # Spellings of w:val that switch an on/off run property off.
    off_values = {"0", "false", "off"}

    _STYLE_NAME_TO_HEADING = {
        "title": 1, "heading 1": 1, "heading 2": 2, "heading 3": 3,
        "heading 4": 4, "heading 5": 5, "heading 6": 6,
    }

    def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int:
        """Return the heading level mapped to *style_id*, or 0 if unmapped."""
        return style_to_level.get(style_id, 0)

    def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]:
        """Return ``"bullet"`` / ``"number"`` for *style_id*, or ``None``."""
        return style_to_list.get(style_id, None)

    def is_toggle_on(run: Any, prop: str) -> bool:
        """Tell whether the on/off run property ``w:<prop>`` is enabled.

        The element being present is not enough: a ``w:val`` of ``0``,
        ``false`` or ``off`` explicitly disables the property.
        """
        elem = run.find(f"w:rPr/w:{prop}", namespaces=namespaces)
        if elem is None:
            return False
        value = elem.get(val_attr)
        return value is None or value.strip().lower() not in off_values

    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
        """Join the text of every run below *para*, marking bold and italic.

        All ``w:t`` children of a run contribute text; ``w:tab``, ``w:br``
        and ``w:cr`` contribute a single space so neighbouring words do not
        fuse. Bold runs are wrapped in ``**`` and italic runs in ``*``;
        whitespace-only runs are left unwrapped. The result is stripped.
        """
        pieces = []
        for run in para.findall(".//w:r", namespaces=namespaces):
            fragments = []
            # Only direct children are read: nested runs are visited on
            # their own by the findall above.
            for node in run:
                if node.tag == text_tag:
                    fragments.append(node.text or "")
                elif node.tag in whitespace_tags:
                    fragments.append(" ")
            text = "".join(fragments)
            if not text:
                continue
            if text.strip():
                if is_toggle_on(run, "b"):
                    text = f"**{text}**"
                if is_toggle_on(run, "i"):
                    text = f"*{text}*"
            pieces.append(text)
        return "".join(pieces).strip()

    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
        """Render a ``w:tbl`` element through ``build_markdown_table``.

        Returns an empty string when the table has no rows. Each cell is
        flattened to one line: the text of its block-level children
        (paragraphs, nested tables, ...) is joined with single spaces.
        """
        rows = table_elem.findall(".//w:tr", namespaces=namespaces)
        if not rows:
            return ""

        rows_data = []
        for row in rows:
            cell_texts = []
            for cell in row.findall(".//w:tc", namespaces=namespaces):
                parts = (
                    extract_text_with_formatting(block, namespaces)
                    for block in cell
                )
                cell_text = " ".join(part for part in parts if part)
                cell_texts.append(cell_text.replace("\n", " ").strip())
            if cell_texts:
                rows_data.append(cell_texts)
        return build_markdown_table(rows_data)

    try:
        style_to_level: Dict[str, int] = {}
        style_to_list: Dict[str, str] = {}
        markdown_lines: List[str] = []

        with zipfile.ZipFile(file_path) as zip_file:
            # Style information is optional: if it cannot be read, every
            # paragraph is simply rendered as plain text.
            try:
                styles_file = safe_open_zip(zip_file, "word/styles.xml")
                if styles_file:
                    styles_root = ET.parse(styles_file).getroot()
                    for style in styles_root.findall(
                        ".//w:style", namespaces=namespaces
                    ):
                        style_id = style.get(f"{{{word_namespace}}}styleId")
                        style_name_elem = style.find("w:name", namespaces=namespaces)
                        if not style_id or style_name_elem is None:
                            continue
                        style_name = style_name_elem.get(val_attr)
                        if not style_name:
                            continue
                        style_name_lower = style_name.lower()
                        if style_name_lower in _STYLE_NAME_TO_HEADING:
                            style_to_level[style_id] = _STYLE_NAME_TO_HEADING[
                                style_name_lower
                            ]
                        elif (
                            style_name_lower.startswith("list bullet")
                            or style_name_lower == "bullet"
                        ):
                            style_to_list[style_id] = "bullet"
                        elif (
                            style_name_lower.startswith("list number")
                            or style_name_lower == "number"
                        ):
                            style_to_list[style_id] = "number"
            except Exception:
                # Deliberate best effort; mappings collected so far are kept.
                pass

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"

            root = ET.parse(document_file).getroot()
            body = root.find(".//w:body", namespaces=namespaces)
            if body is None:
                return None, "document.xml 中未找到 w:body 元素"

            for child in body.findall("./*", namespaces=namespaces):
                if child.tag.endswith("}p"):
                    style_elem = child.find(".//w:pStyle", namespaces=namespaces)
                    style_id = (
                        style_elem.get(val_attr)
                        if style_elem is not None
                        else None
                    )

                    heading_level = get_heading_level(style_id, style_to_level)
                    list_style = get_list_style(style_id, style_to_list)
                    para_text = extract_text_with_formatting(child, namespaces)

                    # Empty paragraphs are dropped entirely.
                    if para_text:
                        if heading_level > 0:
                            markdown_lines.append(f"{'#' * heading_level} {para_text}")
                        elif list_style == "bullet":
                            markdown_lines.append(f"- {para_text}")
                        elif list_style == "number":
                            markdown_lines.append(f"1. {para_text}")
                        else:
                            markdown_lines.append(para_text)
                        # Blank line so consecutive blocks stay separate.
                        markdown_lines.append("")

                elif child.tag.endswith("}tbl"):
                    table_md = convert_table_to_markdown(child, namespaces)
                    if table_md:
                        markdown_lines.append(table_md)
                        markdown_lines.append("")

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        # Boundary of the reader: failures are reported as a message
        # instead of being raised to the caller.
        return None, f"XML 解析失败: {str(e)}"
|