Files
lyxy-document/scripts/utils/file_detection.py
lanyuanxiaoyao cf10458dd6 feat: 添加 doc/xls/ppt 旧格式文档支持
- 新增 DocReader,支持 markitdown 和 pypandoc-binary 解析器
- 新增 XlsReader,支持 unstructured、markitdown 和 pandas+xlrd 解析器
- 新增 PptReader,支持 markitdown 解析器
- 添加 olefile 依赖用于验证 OLE2 格式
- 更新 config.py 添加 doc/xls/ppt 依赖配置
- 更新 --advice 支持 doc/xls/ppt 格式
- 添加相应的测试用例
- 同步 specs 到主目录
2026-03-10 23:09:13 +08:00

107 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""文件类型检测模块,用于验证和检测输入文件类型。"""
import os
import zipfile
from typing import List, Optional
def _is_valid_ole(file_path: str) -> bool:
"""验证 OLE2 格式文件DOC/XLS/PPT"""
try:
import olefile
except ImportError:
# 如果 olefile 未安装,就不做严格验证
return True
try:
return olefile.isOleFile(file_path)
except Exception:
return False
def _is_valid_ooxml(file_path: str, required_files: List[str]) -> bool:
"""验证 OOXML 格式文件DOCX/PPTX/XLSX"""
try:
with zipfile.ZipFile(file_path, "r") as zip_file:
names = set(zip_file.namelist())
return all(r in names for r in required_files)
except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False
_DOCX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
_PPTX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "ppt/presentation.xml"]
_XLSX_REQUIRED = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
def is_valid_docx(file_path: str) -> bool:
    """Return whether *file_path* passes the OOXML check for DOCX.

    Delegates to ``_is_valid_ooxml`` with the DOCX member list.
    """
    docx_members = _DOCX_REQUIRED
    return _is_valid_ooxml(file_path, required_files=docx_members)
def is_valid_pptx(file_path: str) -> bool:
    """Return whether *file_path* passes the OOXML check for PPTX.

    Delegates to ``_is_valid_ooxml`` with the PPTX member list.
    """
    pptx_members = _PPTX_REQUIRED
    return _is_valid_ooxml(file_path, required_files=pptx_members)
def is_valid_xlsx(file_path: str) -> bool:
    """Return whether *file_path* passes the OOXML check for XLSX.

    Delegates to ``_is_valid_ooxml`` with the XLSX member list.
    """
    xlsx_members = _XLSX_REQUIRED
    return _is_valid_ooxml(file_path, required_files=xlsx_members)
def is_valid_doc(file_path: str) -> bool:
    """Return whether *file_path* passes the OLE2 container check for DOC.

    The helper receives only the path, so the check is at container level
    and is not specific to Word documents.
    """
    passes_ole_check = _is_valid_ole(file_path)
    return passes_ole_check
def is_valid_xls(file_path: str) -> bool:
    """Return whether *file_path* passes the OLE2 container check for XLS.

    The helper receives only the path, so the check is at container level
    and is not specific to Excel workbooks.
    """
    passes_ole_check = _is_valid_ole(file_path)
    return passes_ole_check
def is_valid_ppt(file_path: str) -> bool:
    """Return whether *file_path* passes the OLE2 container check for PPT.

    The helper receives only the path, so the check is at container level
    and is not specific to PowerPoint presentations.
    """
    passes_ole_check = _is_valid_ole(file_path)
    return passes_ole_check
def is_valid_pdf(file_path: str) -> bool:
    """Return whether *file_path* starts with the ``%PDF`` header bytes.

    Returns False when the file cannot be opened or read.
    """
    pdf_magic = b"%PDF"
    try:
        with open(file_path, "rb") as stream:
            leading_bytes = stream.read(len(pdf_magic))
    except OSError:
        return False
    return leading_bytes == pdf_magic
def is_html_file(file_path: str) -> bool:
    """Return whether *file_path* ends in ``.html`` or ``.htm``.

    Only the name is inspected (case-insensitively); the file is not opened.
    """
    return file_path.lower().endswith((".html", ".htm"))
def is_url(input_str: str) -> bool:
    """Return whether *input_str* starts with an HTTP or HTTPS scheme.

    The scheme is compared case-insensitively, so ``HTTP://host`` and
    ``Https://host`` are recognised as well as the lower-case forms.

    Args:
        input_str: The raw user input (a path or a URL).

    Returns:
        True if the input begins with ``http://`` or ``https://`` in any
        letter case, otherwise False.
    """
    # Only the first 8 characters can hold "https://"; lower-case just those.
    scheme_prefix = input_str[:8].lower()
    return scheme_prefix.startswith(("http://", "https://"))
# Maps a lower-case file extension (with its leading dot) to the function
# that validates a file of that type. Looked up by detect_file_type.
_FILE_TYPE_VALIDATORS = {
    # OOXML (ZIP-based) formats
    ".docx": is_valid_docx,
    ".pptx": is_valid_pptx,
    ".xlsx": is_valid_xlsx,
    # PDF, checked by header bytes
    ".pdf": is_valid_pdf,
    # Legacy OLE2 formats
    ".doc": is_valid_doc,
    ".xls": is_valid_xls,
    ".ppt": is_valid_ppt,
}
def detect_file_type(file_path: str) -> Optional[str]:
    """Detect the document type of *file_path* from its extension and content.

    The lower-cased extension selects a validator from
    ``_FILE_TYPE_VALIDATORS``; the validator is then run on the file.

    Returns:
        The extension without its leading dot (for example ``'docx'`` or
        ``'pdf'``) when a validator exists and accepts the file, otherwise
        None.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    check = _FILE_TYPE_VALIDATORS.get(suffix)
    if not check or not check(file_path):
        return None
    return suffix.lstrip(".")