1
0
Files
Skill/skills/lyxy-reader-office/scripts/pdf_parser.py

135 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""PDF 文件解析模块,提供多种解析方法。"""
from typing import Optional, Tuple
from common import _unstructured_elements_to_markdown, parse_with_markitdown
def parse_pdf_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 docling 库解析 PDF 文件(不启用 OCR"""
try:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
except ImportError:
return None, "docling 库未安装"
try:
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=PdfPipelineOptions(do_ocr=False)
)
}
)
result = converter.convert(file_path)
markdown_content = result.document.export_to_markdown()
if not markdown_content.strip():
return None, "文档为空"
return markdown_content, None
except Exception as e:
return None, f"docling 解析失败: {str(e)}"
def parse_pdf_with_docling_ocr(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 docling 库解析 PDF 文件(启用 OCR"""
try:
from docling.document_converter import DocumentConverter
except ImportError:
return None, "docling 库未安装"
try:
converter = DocumentConverter()
result = converter.convert(file_path)
markdown_content = result.document.export_to_markdown()
if not markdown_content.strip():
return None, "文档为空"
return markdown_content, None
except Exception as e:
return None, f"docling OCR 解析失败: {str(e)}"
def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 unstructured 库解析 PDF 文件fast 策略)"""
try:
from unstructured.partition.pdf import partition_pdf
except ImportError:
return None, "unstructured 库未安装"
try:
elements = partition_pdf(
filename=file_path,
infer_table_structure=True,
strategy="fast",
languages=["chi_sim"],
)
# fast 策略不做版面分析Title 类型标注不可靠
content = _unstructured_elements_to_markdown(elements, trust_titles=False)
if not content.strip():
return None, "文档为空"
return content, None
except Exception as e:
return None, f"unstructured 解析失败: {str(e)}"
def parse_pdf_with_unstructured_ocr(
file_path: str,
) -> Tuple[Optional[str], Optional[str]]:
"""使用 unstructured 库解析 PDF 文件hi_res 策略 + PaddleOCR"""
try:
from unstructured.partition.pdf import partition_pdf
except ImportError:
return None, "unstructured 库未安装"
try:
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE
except ImportError:
return None, "unstructured-paddleocr 库未安装"
try:
elements = partition_pdf(
filename=file_path,
infer_table_structure=True,
strategy="hi_res",
languages=["chi_sim"],
ocr_agent=OCR_AGENT_PADDLE,
table_ocr_agent=OCR_AGENT_PADDLE,
)
content = _unstructured_elements_to_markdown(elements, trust_titles=True)
if not content.strip():
return None, "文档为空"
return content, None
except Exception as e:
return None, f"unstructured OCR 解析失败: {str(e)}"
def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 MarkItDown 库解析 PDF 文件"""
return parse_with_markitdown(file_path)
def parse_pdf_with_pypdf(file_path: str) -> Tuple[Optional[str], Optional[str]]:
"""使用 pypdf 库解析 PDF 文件"""
try:
from pypdf import PdfReader
except ImportError:
return None, "pypdf 库未安装"
try:
reader = PdfReader(file_path)
md_content = []
for page in reader.pages:
text = page.extract_text(extraction_mode="plain")
if text and text.strip():
md_content.append(text.strip())
md_content.append("")
content = "\n".join(md_content).strip()
if not content:
return None, "文档为空"
return content, None
except Exception as e:
return None, f"pypdf 解析失败: {str(e)}"