"""Parse PDF files with the docling library (OCR disabled)."""

from typing import Optional, Tuple


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a PDF to Markdown with docling, keeping OCR switched off.

    Args:
        file_path: Path of the PDF; passed straight to
            ``DocumentConverter.convert``.

    Returns:
        A ``(markdown, error)`` pair. On success ``markdown`` holds the
        exported Markdown text and ``error`` is ``None``. On failure
        ``markdown`` is ``None`` and ``error`` is a message: docling could
        not be imported, the exported text was empty or whitespace-only,
        or the conversion raised an exception.
    """
    # docling is imported inside the function so that a missing install is
    # reported through the return value rather than as an import-time crash.
    try:
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import DocumentConverter, PdfFormatOption
    except ImportError:
        return None, "docling 库未安装"

    try:
        # Build the PDF pipeline configuration step by step, OCR disabled.
        pdf_options = PdfPipelineOptions(do_ocr=False)
        pdf_format = PdfFormatOption(pipeline_options=pdf_options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

        document = converter.convert(file_path).document
        markdown = document.export_to_markdown()

        # Whitespace-only output is treated the same as an empty document.
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as e:
        # Any failure during setup, conversion or export is reported to the
        # caller as an error message instead of propagating.
        return None, f"docling 解析失败: {str(e)}"