"""使用 LibreOffice soffice 命令行解析 DOCX 文件""" import subprocess import tempfile import shutil from pathlib import Path from typing import Optional, Tuple def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]: """使用 LibreOffice soffice 解析 DOCX 文件""" # 检测 soffice 是否在 PATH 中 soffice_path = shutil.which("soffice") if not soffice_path: return None, "LibreOffice 未安装" # 创建临时输出目录 with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) input_path = Path(file_path) expected_output = temp_path / (input_path.stem + ".md") # 构建命令 cmd = [ soffice_path, "--headless", "--convert-to", "md", "--outdir", str(temp_path), str(input_path) ] # 执行命令,超时 60 秒 try: result = subprocess.run( cmd, capture_output=True, text=True, timeout=60 ) except subprocess.TimeoutExpired: return None, "LibreOffice 转换超时 (60秒)" # 检查返回码 if result.returncode != 0: return None, f"LibreOffice 转换失败 (code: {result.returncode})" # 检查输出文件是否存在 output_file = None if expected_output.exists(): output_file = expected_output else: # Fallback: 遍历目录找任意 .md 文件 md_files = list(temp_path.glob("*.md")) if md_files: output_file = md_files[0] if not output_file: return None, "LibreOffice 未生成输出文件" # 读取输出内容 content = output_file.read_text(encoding="utf-8", errors="replace") content = content.strip() if not content: return None, "LibreOffice 输出为空" return content, None