1
0
Files
Skill/temp/scripts/pdf_parser.py

77 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""PDF 文件解析模块,提供多种解析方法。"""
from typing import Optional, Tuple
from common import _unstructured_elements_to_markdown, parse_with_docling, parse_with_markitdown
def parse_pdf_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the docling backend.

    Thin PDF-specific entry point: all work is delegated to
    ``parse_with_docling`` imported from the ``common`` module.

    Args:
        file_path: Path of the PDF file; passed through unchanged.

    Returns:
        Whatever ``parse_with_docling`` returns. The annotation is a pair of
        optional strings -- presumably ``(content, None)`` on success and
        ``(None, error_message)`` on failure, like the other parsers in this
        module; confirm against ``common``.
    """
    outcome = parse_with_docling(file_path)
    return outcome
def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the unstructured library.

    The ``hi_res`` strategy (layout analysis) with PaddleOCR as the OCR agent
    is tried first. If anything in that attempt raises -- including the import
    of the PaddleOCR agent constant -- the ``fast`` strategy is used instead.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        ``(content, None)`` on success, or ``(None, error_message)`` when
        unstructured is not installed, when parsing fails, or when the
        converted document is empty. No exception escapes except an
        ``ImportError`` subclass-unrelated failure of the first import.
    """
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    base_kwargs = {"filename": file_path, "infer_table_structure": True}
    try:
        # Prefer the hi_res strategy (layout analysis + PaddleOCR); fall back
        # to fast when it fails.
        try:
            from unstructured.partition.utils.constants import OCR_AGENT_PADDLE

            elements = partition_pdf(
                **base_kwargs,
                strategy="hi_res",
                languages=["chi_sim"],
                ocr_agent=OCR_AGENT_PADDLE,
                table_ocr_agent=OCR_AGENT_PADDLE,
            )
            trust_titles = True
        except Exception:
            # Deliberate best-effort fallback: any hi_res failure is ignored
            # and the fast strategy is tried. The fast strategy does no layout
            # analysis, so its Title element labels are not reliable.
            elements = partition_pdf(**base_kwargs, strategy="fast", languages=["chi_sim"])
            trust_titles = False

        content = _unstructured_elements_to_markdown(elements, trust_titles)
        # Guard against a None/empty result as well as whitespace-only text so
        # that an empty document is reported as such rather than as a failure.
        if not content or not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"unstructured 解析失败: {e!s}"
def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the MarkItDown backend.

    Thin PDF-specific entry point: all work is delegated to
    ``parse_with_markitdown`` imported from the ``common`` module.

    Args:
        file_path: Path of the PDF file; passed through unchanged.

    Returns:
        Whatever ``parse_with_markitdown`` returns. The annotation is a pair
        of optional strings -- presumably ``(content, None)`` on success and
        ``(None, error_message)`` on failure, like the other parsers in this
        module; confirm against ``common``.
    """
    outcome = parse_with_markitdown(file_path)
    return outcome
def parse_pdf_with_pypdf(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the pypdf library.

    Text is extracted page by page in ``plain`` extraction mode. Pages whose
    text is missing or whitespace-only are skipped, and the remaining page
    texts are stripped and joined with one blank line between them.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        ``(content, None)`` on success, or ``(None, error_message)`` when
        pypdf is not installed, when reading or extraction raises, or when no
        page yields any text.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        return None, "pypdf 库未安装"

    try:
        reader = PdfReader(file_path)
        # Lazy generator: pages are still extracted one at a time, in order.
        raw_pages = (
            page.extract_text(extraction_mode="plain") for page in reader.pages
        )
        page_texts = [text.strip() for text in raw_pages if text and text.strip()]
        # Every kept entry is non-empty and already stripped, so joining with a
        # blank line needs no further trimming.
        content = "\n\n".join(page_texts)
        if not content:
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"pypdf 解析失败: {e!s}"