# lyxy-document/scripts/readers/pdf/unstructured.py

"""使用 unstructured 库解析 PDF 文件（fast 策略）"""

from typing import Optional, Tuple

from scripts.readers._utils import convert_unstructured_to_markdown


def parse(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the unstructured library using the "fast" strategy.

    Args:
        file_path: Path of the PDF file; passed to ``partition_pdf`` as
            ``filename``.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the text
        produced by ``convert_unstructured_to_markdown`` and ``error`` is
        ``None``. On failure ``content`` is ``None`` and ``error`` is a
        message saying that the library is not installed, that the converted
        text is blank, or that an exception was raised while parsing.
    """
    # Imported inside the function so that a missing installation is reported
    # through the returned error message.
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    partition_options = {
        "filename": file_path,
        "infer_table_structure": True,
        "strategy": "fast",
        "languages": ["chi_sim"],
    }
    try:
        parsed_elements = partition_pdf(**partition_options)
        # The fast strategy performs no layout analysis, so its Title
        # element labels are unreliable.
        markdown = convert_unstructured_to_markdown(
            parsed_elements, trust_titles=False
        )
        if markdown.strip():
            return markdown, None
        return None, "文档为空"
    except Exception as err:
        return None, f"unstructured 解析失败: {str(err)}"