#!/usr/bin/env python3
"""PDF parsing module offering three alternative parsing backends.

Every parser takes a file path and returns a ``(content, error)`` pair:
``(text, None)`` on success, or ``(None, message)`` when the backend is not
installed, the document yields no text, or parsing raises.
"""

from typing import Iterable, Optional, Tuple


def _join_nonempty(texts: Iterable[Optional[str]]) -> str:
    """Strip each text, drop empty/None entries, and join with blank lines."""
    stripped = (text.strip() for text in texts if text)
    return "\n\n".join(chunk for chunk in stripped if chunk)


def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the MarkItDown library.

    Args:
        file_path: Path of the PDF file, passed to ``MarkItDown.convert``.

    Returns:
        ``(text_content, None)`` on success (the text is returned unstripped),
        otherwise ``(None, error_message)``.
    """
    # Only the import is guarded by ImportError, so that an ImportError raised
    # later during conversion is reported as a parse failure rather than as
    # "library not installed".
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    try:
        result = MarkItDown().convert(file_path)
        text = result.text_content
        # Treat a missing (None) text the same as whitespace-only output.
        if not text or not text.strip():
            return None, "文档为空"
        return text, None
    except Exception as e:
        return None, f"MarkItDown 解析失败: {e}"


def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the unstructured library.

    The non-empty ``text`` of each partitioned element is stripped and the
    pieces are joined with a blank line between them.

    Args:
        file_path: Path of the PDF file, passed to ``partition_pdf``.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, error_message)``.
    """
    try:
        from unstructured.partition.pdf import partition_pdf
    except ImportError:
        return None, "unstructured 库未安装"

    try:
        elements = partition_pdf(
            filename=file_path,
            strategy="fast",
            infer_table_structure=True,
            extract_images_in_pdf=False,
        )
        # Elements without a ``text`` attribute are skipped.
        content = _join_nonempty(
            getattr(element, "text", None) for element in elements
        )
        if not content:
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"unstructured 解析失败: {e}"


def parse_pdf_with_pypdf(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the pypdf library.

    The text of each page is extracted in ``"plain"`` mode and stripped;
    pages with text are joined with a blank line between them.

    Args:
        file_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, error_message)``.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        return None, "pypdf 库未安装"

    try:
        reader = PdfReader(file_path)
        content = _join_nonempty(
            page.extract_text(extraction_mode="plain") for page in reader.pages
        )
        if not content:
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"pypdf 解析失败: {e}"