Files
lyxy-document/scripts/utils/file_detection.py
lanyuanxiaoyao a490b2642c feat: 新增 PPT 旧格式支持,重构 LibreOffice 转换工具
- 新增 PPT (旧格式) 解析器
- 重构 _utils.py,提取通用 convert_via_libreoffice 函数
- 更新依赖配置,添加 PPT 相关依赖
- 完善文档,更新 README 和 SKILL.md
- 添加 PPT 文件检测函数
- 新增 PPT 解析器测试用例
2026-03-16 22:49:04 +08:00

85 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""文件类型检测模块,用于验证和检测输入文件类型。"""
import os
import zipfile
from typing import List, Optional
def _is_valid_ole(file_path: str) -> bool:
"""验证 OLE2 格式文件XLS/DOC"""
try:
import olefile
except ImportError:
# 如果 olefile 未安装,就不做严格验证
return True
try:
return olefile.isOleFile(file_path)
except Exception:
return False
def _is_valid_ooxml(file_path: str, required_files: List[str]) -> bool:
"""验证 OOXML 格式文件DOCX/PPTX/XLSX"""
try:
with zipfile.ZipFile(file_path, "r") as zip_file:
names = set(zip_file.namelist())
return all(r in names for r in required_files)
except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False
_DOCX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
_PPTX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "ppt/presentation.xml"]
_XLSX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
def is_valid_docx(file_path: str) -> bool:
"""验证文件是否为有效的 DOCX 格式"""
return _is_valid_ooxml(file_path, _DOCX_REQUIRED)
def is_valid_pptx(file_path: str) -> bool:
"""验证文件是否为有效的 PPTX 格式"""
return _is_valid_ooxml(file_path, _PPTX_REQUIRED)
def is_valid_xlsx(file_path: str) -> bool:
"""验证文件是否为有效的 XLSX 格式"""
return _is_valid_ooxml(file_path, _XLSX_REQUIRED)
def is_valid_xls(file_path: str) -> bool:
"""验证文件是否为有效的 XLS 格式"""
return _is_valid_ole(file_path)
def is_valid_doc(file_path: str) -> bool:
"""验证文件是否为有效的 DOC 格式OLE2"""
return _is_valid_ole(file_path)
def is_valid_ppt(file_path: str) -> bool:
"""验证文件是否为有效的 PPT 格式OLE2"""
return _is_valid_ole(file_path)
def is_valid_pdf(file_path: str) -> bool:
"""验证文件是否为有效的 PDF 格式"""
try:
with open(file_path, "rb") as f:
header = f.read(4)
return header == b"%PDF"
except (IOError, OSError):
return False
def is_html_file(file_path: str) -> bool:
"""判断文件是否为 HTML 文件(仅检查扩展名)"""
ext = file_path.lower()
return ext.endswith(".html") or ext.endswith(".htm")
def is_url(input_str: str) -> bool:
"""判断输入是否为 URL"""
return input_str.startswith("http://") or input_str.startswith("https://")