"""PDF 文件阅读器,支持多种解析方法(OCR 优先)。""" import os from typing import List, Optional, Tuple from readers.base import BaseReader from utils import is_valid_pdf from . import docling_ocr from . import unstructured_ocr from . import docling from . import unstructured from . import markitdown from . import pypdf PARSERS = [ ("docling OCR", docling_ocr.parse), ("unstructured OCR", unstructured_ocr.parse), ("docling", docling.parse), ("unstructured", unstructured.parse), ("MarkItDown", markitdown.parse), ("pypdf", pypdf.parse), ] class PdfReader(BaseReader): """PDF 文件阅读器""" def supports(self, file_path: str) -> bool: return file_path.lower().endswith('.pdf') def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: failures = [] # 检查文件是否存在 if not os.path.exists(file_path): return None, ["文件不存在"] # 验证文件格式 if not is_valid_pdf(file_path): return None, ["不是有效的 PDF 文件"] content = None for parser_name, parser_func in PARSERS: try: content, error = parser_func(file_path) if content is not None: return content, failures else: failures.append(f"- {parser_name}: {error}") except Exception as e: failures.append(f"- {parser_name}: [意外异常] {type(e).__name__}: {str(e)}") return None, failures