# lyxy-document/scripts/readers/pdf/unstructured_ocr.py

"""使用 unstructured 库解析 PDF 文件（hi_res 策略 + PaddleOCR）"""

from typing import Optional, Tuple

from scripts.core import _unstructured_elements_to_markdown


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF with unstructured's ``hi_res`` strategy and the Paddle OCR agent.

    Args:
        file_path: Path of the PDF; handed to ``partition_pdf`` as ``filename``.

    Returns:
        ``(markdown, None)`` when the converted content is not blank, otherwise
        ``(None, message)`` where ``message`` reports a failed dependency
        import, a blank result, or the exception raised while parsing.
    """
    # Both dependencies are imported inside the function so that a failed
    # import is reported through the error slot of the returned tuple.
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        from unstructured.partition.utils.constants import OCR_AGENT_PADDLE
    except ImportError:
        return None, "unstructured-paddleocr 库未安装"

    # The same OCR agent is used for page text and for table contents.
    partition_options = {
        "filename": file_path,
        "infer_table_structure": True,
        "strategy": "hi_res",
        "languages": ["chi_sim"],
        "ocr_agent": OCR_AGENT_PADDLE,
        "table_ocr_agent": OCR_AGENT_PADDLE,
    }

    try:
        pdf_elements = partition_pdf(**partition_options)
        markdown = _unstructured_elements_to_markdown(
            pdf_elements, trust_titles=True
        )
        if markdown.strip():
            return markdown, None
        # Whitespace-only output is treated as an empty document.
        return None, "文档为空"
    except Exception as exc:
        # Any failure while partitioning or converting is folded into the
        # error message rather than propagated to the caller.
        return None, f"unstructured OCR 解析失败: {exc!s}"