#!/usr/bin/env python3
"""PDF parsing module that offers several alternative parsing backends.

Every parser takes a file path and returns a ``(content, error)`` pair:
``(text, None)`` on success, or ``(None, message)`` when the backing library
is missing, the extracted text is empty, or parsing raised an exception.
"""

from typing import Optional, Tuple

from common import _unstructured_elements_to_markdown, parse_with_markitdown


def parse_pdf_with_docling(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with docling, with OCR explicitly disabled.

    Args:
        file_path: Path of the PDF file handed to the docling converter.

    Returns:
        ``(markdown, None)`` when docling yields non-blank Markdown, otherwise
        ``(None, error_message)``.
    """
    # Imported lazily so that a missing optional dependency is reported as an
    # error value instead of breaking the import of this module.
    try:
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import DocumentConverter, PdfFormatOption
    except ImportError:
        return None, "docling 库未安装"

    try:
        no_ocr_pipeline = PdfPipelineOptions(do_ocr=False)
        pdf_option = PdfFormatOption(pipeline_options=no_ocr_pipeline)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})
        markdown = converter.convert(file_path).document.export_to_markdown()
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"docling 解析失败: {str(exc)}"


def parse_pdf_with_docling_ocr(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with docling using a default-configured converter.

    NOTE(review): this variant is meant to be the OCR-enabled one; it relies
    on docling's default pipeline options having OCR switched on -- confirm
    against the installed docling version.

    Args:
        file_path: Path of the PDF file handed to the docling converter.

    Returns:
        ``(markdown, None)`` when docling yields non-blank Markdown, otherwise
        ``(None, error_message)``.
    """
    try:
        from docling.document_converter import DocumentConverter
    except ImportError:
        return None, "docling 库未安装"

    try:
        conversion = DocumentConverter().convert(file_path)
        markdown = conversion.document.export_to_markdown()
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"docling OCR 解析失败: {str(exc)}"


def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with unstructured using the ``fast`` strategy.

    Args:
        file_path: Path of the PDF file passed to ``partition_pdf``.

    Returns:
        ``(markdown, None)`` when the converted elements produce non-blank
        text, otherwise ``(None, error_message)``.
    """
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        partition_options = {
            "filename": file_path,
            "infer_table_structure": True,
            "strategy": "fast",
            "languages": ["chi_sim"],
        }
        elements = partition_pdf(**partition_options)
        # The fast strategy performs no layout analysis, so its Title labels
        # are unreliable and must not be trusted.
        markdown = _unstructured_elements_to_markdown(elements, trust_titles=False)
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"unstructured 解析失败: {str(exc)}"


def parse_pdf_with_unstructured_ocr(
    file_path: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with unstructured using the ``hi_res`` strategy and PaddleOCR.

    Args:
        file_path: Path of the PDF file passed to ``partition_pdf``.

    Returns:
        ``(markdown, None)`` when the converted elements produce non-blank
        text, otherwise ``(None, error_message)``.
    """
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    # Checked separately so a missing Paddle agent constant gets its own message.
    try:
        from unstructured.partition.utils.constants import OCR_AGENT_PADDLE
    except ImportError:
        return None, "unstructured-paddleocr 库未安装"

    try:
        partition_options = {
            "filename": file_path,
            "infer_table_structure": True,
            "strategy": "hi_res",
            "languages": ["chi_sim"],
            # The same Paddle agent is used for page text and for tables.
            "ocr_agent": OCR_AGENT_PADDLE,
            "table_ocr_agent": OCR_AGENT_PADDLE,
        }
        elements = partition_pdf(**partition_options)
        markdown = _unstructured_elements_to_markdown(elements, trust_titles=True)
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"unstructured OCR 解析失败: {str(exc)}"


def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF by delegating to the shared ``parse_with_markitdown`` helper.

    Args:
        file_path: Path of the PDF file, forwarded unchanged to the helper.

    Returns:
        Whatever ``parse_with_markitdown`` returns for this path.
    """
    outcome = parse_with_markitdown(file_path)
    return outcome


def parse_pdf_with_pypdf(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with pypdf, extracting plain text page by page.

    Pages whose extracted text is missing or blank are skipped; the remaining
    page texts are stripped and separated from each other by one blank line.

    Args:
        file_path: Path of the PDF file opened with ``PdfReader``.

    Returns:
        ``(text, None)`` when at least one page has text, otherwise
        ``(None, error_message)``.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        return None, "pypdf 库未安装"

    try:
        page_texts = []
        for page in PdfReader(file_path).pages:
            raw_text = page.extract_text(extraction_mode="plain")
            cleaned = raw_text.strip() if raw_text else ""
            if cleaned:
                page_texts.append(cleaned)
        # Each entry is already stripped and non-empty, so joining with a
        # blank line needs no further trimming.
        combined = "\n\n".join(page_texts)
        if combined:
            return combined, None
        return None, "文档为空"
    except Exception as exc:
        return None, f"pypdf 解析失败: {str(exc)}"