1
0

增加pdf文件的读取

This commit is contained in:
2026-02-14 23:20:47 +08:00
parent 8c27b08fdc
commit b022ac736b
4 changed files with 201 additions and 14 deletions

View File

@@ -114,6 +114,16 @@ def is_valid_xlsx(file_path: str) -> bool:
return False
def is_valid_pdf(file_path: str) -> bool:
    """Check whether the file at ``file_path`` starts with the PDF signature.

    Only the first four bytes are inspected; the rest of the file is not
    parsed or validated.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the first four bytes are ``%PDF``. False if they are not
        (including files shorter than four bytes), or if the file cannot be
        opened or read.
    """
    try:
        # Keep the guarded region limited to the I/O that can actually fail.
        with open(file_path, "rb") as f:
            header = f.read(4)
    except OSError:
        # IOError is an alias of OSError in Python 3, so a single clause
        # covers missing files, permission errors, directories, etc.
        return False
    return header == b"%PDF"
def remove_markdown_images(markdown_text: str) -> str:
    """Strip image markup from a Markdown string.

    Every match of the module-level ``IMAGE_PATTERN`` is replaced with an
    empty string; all other text is left as-is.

    Args:
        markdown_text: Markdown source to clean.

    Returns:
        The text with all ``IMAGE_PATTERN`` matches removed.
    """
    # NOTE(review): IMAGE_PATTERN is defined outside this view; presumably a
    # compiled regex matching Markdown image syntax — confirm at its definition.
    without_images = IMAGE_PATTERN.sub("", markdown_text)
    return without_images
@@ -286,7 +296,7 @@ def search_markdown(
def detect_file_type(file_path: str) -> Optional[str]:
"""检测文件类型,返回 'docx''pptx''xlsx'"""
"""检测文件类型,返回 'docx''pptx''xlsx''pdf'"""
_, ext = os.path.splitext(file_path)
ext = ext.lower()
@@ -299,5 +309,8 @@ def detect_file_type(file_path: str) -> Optional[str]:
elif ext == ".xlsx":
if is_valid_xlsx(file_path):
return "xlsx"
elif ext == ".pdf":
if is_valid_pdf(file_path):
return "pdf"
return None