feat: 新增 PPT 旧格式支持,重构 LibreOffice 转换工具
- 新增 PPT (旧格式) 解析器
- 重构 _utils.py,提取通用 convert_via_libreoffice 函数
- 更新依赖配置,添加 PPT 相关依赖
- 完善文档,更新 README 和 SKILL.md
- 添加 PPT 文件检测函数
- 新增 PPT 解析器测试用例
This commit is contained in:
37
scripts/readers/ppt/libreoffice.py
Normal file
37
scripts/readers/ppt/libreoffice.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""使用 LibreOffice soffice 命令行转换 PPT 为 PPTX 后复用 PptxReader 解析"""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from readers._utils import convert_via_libreoffice
|
||||
from readers.pptx import PptxReader
|
||||
|
||||
|
||||
def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a legacy PPT file by converting it to PPTX with LibreOffice.

    The input is first converted through ``convert_via_libreoffice`` into a
    temporary directory; the converted file is then handed to ``PptxReader``.

    Args:
        file_path: Path to the PPT file.

    Returns:
        A ``(markdown_content, error_message)`` pair: ``(content, None)`` on
        success, ``(None, error)`` on failure.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Step 1: PPT -> PPTX, written into the scratch directory.
        converted_path, convert_error = convert_via_libreoffice(
            input_path=file_path,
            target_format="pptx",
            output_dir=Path(scratch_dir),
            timeout=60,
        )
        if convert_error:
            return None, convert_error

        # Step 2: reuse the PPTX reader on the converted file. This stays
        # inside the ``with`` block because the temporary directory is
        # removed as soon as the block exits.
        markdown, parse_failures = PptxReader().parse(str(converted_path))
        if markdown is None:
            return None, f"转换成功但 PPTX 解析失败: {parse_failures}"
        return markdown, None
|
||||
Reference in New Issue
Block a user