完成多文档读取的脚本
This commit is contained in:
268
temp/scripts/docx.py
Normal file
268
temp/scripts/docx.py
Normal file
@@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env python3
|
||||
"""DOCX 文件解析模块,提供三种解析方法。"""
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from common import build_markdown_table, safe_open_zip
|
||||
|
||||
|
||||
def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown through the MarkItDown library.

    Args:
        file_path: Path of the ``.docx`` file handed to ``MarkItDown.convert``.

    Returns:
        ``(markdown, None)`` when conversion produced non-blank text,
        otherwise ``(None, error_message)``. An ``ImportError`` raised anywhere
        inside the attempt is reported as the library not being installed;
        any other exception is reported as a parse failure.
    """
    try:
        # Imported lazily so the module stays usable without MarkItDown.
        from markitdown import MarkItDown

        converted = MarkItDown().convert(file_path)
        markdown = converted.text_content
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except ImportError:
        return None, "MarkItDown 库未安装"
    except Exception as exc:
        return None, f"MarkItDown 解析失败: {exc}"
|
||||
|
||||
|
||||
def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown using the python-docx library.

    Headings, bullet/numbered lists, bold/italic/underline runs and tables
    are rendered. Paragraphs and tables are emitted in the order in which
    they appear in the document body.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, error_message)``
        when python-docx cannot be imported, parsing raises, or the document
        yields no text.
    """
    # NOTE(review): if this module is itself named ``docx.py`` and its
    # directory is on ``sys.path``, the import below may resolve to this file
    # instead of python-docx and always report "not installed" -- confirm.
    try:
        from docx import Document
        from docx.table import Table
        from docx.text.paragraph import Paragraph
    except ImportError:
        return None, "python-docx 库未安装"

    def get_heading_level(para: Any) -> int:
        """Return the Markdown heading depth (1-6) for *para*, or 0 if none."""
        style_name = para.style.name if para.style else None
        if not style_name:
            return 0
        if "Title" in style_name:
            return 1
        for level in range(1, 7):
            if f"Heading {level}" in style_name:
                return level
        return 0

    def get_list_style(para: Any) -> Optional[str]:
        """Return ``"bullet"``, ``"number"`` or ``None`` from the style name."""
        style_name = para.style.name if para.style else None
        if not style_name:
            return None
        if style_name.startswith("List Bullet") or style_name == "Bullet":
            return "bullet"
        if style_name.startswith("List Number") or style_name == "Number":
            return "number"
        return None

    def wrap_emphasis(text: str, opener: str, closer: str) -> str:
        """Wrap the non-blank core of *text*, keeping edge whitespace outside.

        Markdown does not treat ``**bold **`` as emphasis, so the markers must
        sit directly against the first and last non-space characters.
        """
        core = text.strip()
        if not core:
            return text
        lead_len = len(text) - len(text.lstrip())
        lead = text[:lead_len]
        trail = text[lead_len + len(core):]
        return f"{lead}{opener}{core}{closer}{trail}"

    def convert_runs_to_markdown(runs: List[Any]) -> str:
        """Join run texts, applying bold/italic/underline markup per run."""
        parts = []
        for run in runs:
            text = run.text
            if not text:
                continue
            if run.bold:
                text = wrap_emphasis(text, "**", "**")
            if run.italic:
                text = wrap_emphasis(text, "*", "*")
            if run.underline:
                text = wrap_emphasis(text, "<u>", "</u>")
            parts.append(text)
        return "".join(parts)

    def convert_table_to_markdown(table: Any) -> str:
        """Flatten every cell to one line and delegate to the shared builder."""
        rows_data = [
            [cell.text.strip().replace("\n", " ") for cell in row.cells]
            for row in table.rows
        ]
        return build_markdown_table(rows_data)

    def iter_block_items(document: Any):
        """Yield top-level paragraphs and tables in document order."""
        native_iter = getattr(document, "iter_inner_content", None)
        if native_iter is not None:
            yield from native_iter()
            return
        # Fallback for python-docx releases without ``iter_inner_content``:
        # walk the body's child elements and wrap them in proxy objects.
        for child in document.element.body.iterchildren():
            tag = child.tag
            if not isinstance(tag, str):
                continue  # XML comments / processing instructions
            if tag.endswith("}p"):
                yield Paragraph(child, document)
            elif tag.endswith("}tbl"):
                yield Table(child, document)

    markdown_lines: List[str] = []

    def ensure_blank_line() -> None:
        """Append a separating blank line unless the output already ends in one."""
        if markdown_lines and markdown_lines[-1] != "":
            markdown_lines.append("")

    try:
        doc = Document(file_path)
        prev_was_list = False

        for item in iter_block_items(doc):
            if isinstance(item, Table):
                table_md = convert_table_to_markdown(item)
                if not table_md:
                    continue
                # Without a blank line the table rows would be absorbed into a
                # preceding list item or paragraph.
                ensure_blank_line()
                markdown_lines.append(table_md)
                markdown_lines.append("")
                prev_was_list = False
                continue

            text = convert_runs_to_markdown(item.runs).strip()
            if not text:
                continue

            heading_level = get_heading_level(item)
            if heading_level > 0:
                markdown_lines.append(f"{'#' * heading_level} {text}")
                prev_was_list = False
                continue

            list_style = get_list_style(item)
            if list_style in ("bullet", "number"):
                if not prev_was_list:
                    ensure_blank_line()
                marker = "-" if list_style == "bullet" else "1."
                markdown_lines.append(f"{marker} {text}")
                prev_was_list = True
            else:
                if prev_was_list:
                    ensure_blank_line()
                markdown_lines.append(text)
                markdown_lines.append("")
                prev_was_list = False

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"python-docx 解析失败: {e}"
|
||||
|
||||
|
||||
def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a DOCX file to Markdown by reading its XML parts directly.

    ``word/styles.xml`` is consulted on a best-effort basis to map style ids
    to heading levels and list kinds. The children of ``w:body`` in
    ``word/document.xml`` are then walked in order and rendered as headings,
    list items, plain paragraphs or tables.

    Args:
        file_path: Path of the ``.docx`` (zip) file to read.

    Returns:
        ``(markdown, None)`` on success, otherwise ``(None, error_message)``
        when ``document.xml`` or its body is missing, the document yields no
        text, or any exception is raised while reading.
    """
    word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    namespaces = {"w": word_namespace}
    val_attr = f"{{{word_namespace}}}val"
    para_tag = f"{{{word_namespace}}}p"
    run_tag = f"{{{word_namespace}}}r"
    table_tag = f"{{{word_namespace}}}tbl"
    # Attribute values that switch a toggle property such as <w:b> off.
    off_values = ("0", "false", "off")
    heading_names = {"title": 1}
    heading_names.update({f"heading {n}": n for n in range(1, 7)})

    def is_toggle_on(run: Any, tag: str) -> bool:
        """Return True if the run's ``w:rPr/w:<tag>`` toggle is present and on."""
        elem = run.find(f"w:rPr/w:{tag}", namespaces=namespaces)
        if elem is None:
            return False
        return elem.get(val_attr, "true").lower() not in off_values

    def wrap_emphasis(text: str, marker: str) -> str:
        """Wrap the non-blank core of *text*, keeping edge whitespace outside.

        Markdown does not treat ``**bold **`` as emphasis, so the markers must
        sit directly against the first and last non-space characters.
        """
        core = text.strip()
        if not core:
            return text
        lead_len = len(text) - len(text.lstrip())
        lead = text[:lead_len]
        trail = text[lead_len + len(core):]
        return f"{lead}{marker}{core}{marker}{trail}"

    def extract_text_with_formatting(para: Any, namespaces: dict) -> str:
        """Return the text under *para* with bold/italic Markdown markup.

        Runs are visited in document order. When the element contains more
        than one paragraph (e.g. a table cell), a single space separates them.
        """
        pieces: List[str] = []
        for elem in para.iter():
            if elem.tag == para_tag:
                if pieces and pieces[-1] != " ":
                    pieces.append(" ")
                continue
            if elem.tag != run_tag:
                continue
            # A run may carry several <w:t> children; keep all of them. Only
            # direct children are read so nested runs are not counted twice.
            text = "".join(
                t.text or "" for t in elem.findall("w:t", namespaces=namespaces)
            )
            if not text:
                continue
            if is_toggle_on(elem, "b"):
                text = wrap_emphasis(text, "**")
            if is_toggle_on(elem, "i"):
                text = wrap_emphasis(text, "*")
            pieces.append(text)
        return "".join(pieces).strip()

    def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str:
        """Render a ``w:tbl`` element as a pipe table; the first row is the header."""
        md_lines: List[str] = []
        for row in table_elem.findall(".//w:tr", namespaces=namespaces):
            cell_texts = []
            for cell in row.findall(".//w:tc", namespaces=namespaces):
                cell_text = extract_text_with_formatting(cell, namespaces)
                # Cells must stay on one line and must not contain a bare
                # pipe, which would be read as a column boundary.
                cell_text = cell_text.replace("\n", " ").replace("|", "\\|")
                cell_texts.append(cell_text.strip())
            if not cell_texts:
                continue
            md_lines.append("| " + " | ".join(cell_texts) + " |")
            if len(md_lines) == 1:
                # Separator goes after the first row actually emitted.
                md_lines.append("| " + " | ".join(["---"] * len(cell_texts)) + " |")
        return "\n".join(md_lines)

    markdown_lines: List[str] = []

    def ensure_blank_line() -> None:
        """Append a separating blank line unless the output already ends in one."""
        if markdown_lines and markdown_lines[-1] != "":
            markdown_lines.append("")

    try:
        style_to_level: dict = {}
        style_to_list: dict = {}
        prev_was_list = False

        with zipfile.ZipFile(file_path) as zip_file:
            try:
                styles_file = safe_open_zip(zip_file, "word/styles.xml")
                if styles_file:
                    styles_root = ET.parse(styles_file).getroot()
                    for style in styles_root.findall(".//w:style", namespaces=namespaces):
                        style_id = style.get(f"{{{word_namespace}}}styleId")
                        name_elem = style.find("w:name", namespaces=namespaces)
                        if not style_id or name_elem is None:
                            continue
                        style_name = name_elem.get(val_attr)
                        if not style_name:
                            continue
                        # Compare case-insensitively so "Heading 1" and
                        # "heading 1" are both recognised.
                        key = style_name.lower()
                        if key in heading_names:
                            style_to_level[style_id] = heading_names[key]
                        elif key.startswith("list bullet") or key == "bullet":
                            style_to_list[style_id] = "bullet"
                        elif key.startswith("list number") or key == "number":
                            style_to_list[style_id] = "number"
            except Exception:
                # Deliberate best effort: styles only drive heading/list
                # detection, so an unreadable styles part degrades the output
                # to plain paragraphs instead of failing the whole parse.
                pass

            document_file = safe_open_zip(zip_file, "word/document.xml")
            if not document_file:
                return None, "document.xml 不存在或无法访问"

            root = ET.parse(document_file).getroot()
            body = root.find(".//w:body", namespaces=namespaces)
            if body is None:
                return None, "document.xml 中未找到 w:body 元素"

            for child in body:
                if child.tag == para_tag:
                    style_elem = child.find("w:pPr/w:pStyle", namespaces=namespaces)
                    style_id = (
                        style_elem.get(val_attr) if style_elem is not None else None
                    )
                    para_text = extract_text_with_formatting(child, namespaces)
                    if not para_text:
                        continue

                    heading_level = style_to_level.get(style_id, 0)
                    list_style = style_to_list.get(style_id)
                    if heading_level > 0:
                        markdown_lines.append(f"{'#' * heading_level} {para_text}")
                        prev_was_list = False
                    elif list_style in ("bullet", "number"):
                        if not prev_was_list:
                            ensure_blank_line()
                        marker = "-" if list_style == "bullet" else "1."
                        markdown_lines.append(f"{marker} {para_text}")
                        prev_was_list = True
                    else:
                        # A paragraph directly after a list item would be
                        # parsed as a continuation of that item.
                        if prev_was_list:
                            ensure_blank_line()
                        markdown_lines.append(para_text)
                        markdown_lines.append("")
                        prev_was_list = False

                elif child.tag == table_tag:
                    table_md = convert_table_to_markdown(child, namespaces)
                    if table_md:
                        ensure_blank_line()
                        markdown_lines.append(table_md)
                        markdown_lines.append("")
                        prev_was_list = False

        content = "\n".join(markdown_lines)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {e}"
|
||||
Reference in New Issue
Block a user