"""Parse PDF files with the unstructured library (hi_res strategy + PaddleOCR)."""

from typing import Optional, Tuple

from core import _unstructured_elements_to_markdown


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Convert a PDF to Markdown using unstructured's hi_res strategy and PaddleOCR.

    Args:
        file_path: Path of the PDF; forwarded to ``partition_pdf`` as ``filename``.

    Returns:
        A ``(content, error)`` pair. On success it is ``(markdown, None)``.
        On failure it is ``(None, message)``: when a required import is
        unavailable, when the rendered Markdown is blank, or when any
        exception is raised while partitioning or rendering.
    """
    # Import inside the function so an unavailable dependency is reported
    # through the error slot of the return value instead of raising.
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        from unstructured.partition.utils.constants import OCR_AGENT_PADDLE
    except ImportError:
        return None, "unstructured-paddleocr 库未安装"

    # The same PaddleOCR agent is used for both page text and table cells.
    partition_options = {
        "filename": file_path,
        "infer_table_structure": True,
        "strategy": "hi_res",
        "languages": ["chi_sim"],
        "ocr_agent": OCR_AGENT_PADDLE,
        "table_ocr_agent": OCR_AGENT_PADDLE,
    }

    try:
        parsed_elements = partition_pdf(**partition_options)
        # NOTE(review): the meaning of trust_titles is defined by the helper in
        # `core`, which is not visible here -- confirm before changing it.
        markdown = _unstructured_elements_to_markdown(
            parsed_elements, trust_titles=True
        )
        # Evaluated inside the try so a failure here is reported like any
        # other parsing error.
        is_blank = not markdown.strip()
    except Exception as exc:
        return None, f"unstructured OCR 解析失败: {exc!s}"

    if is_blank:
        return None, "文档为空"
    return markdown, None