Files
lyxy-document/scripts/readers/pdf/__init__.py
lanyuanxiaoyao 9daff73589 refactor: 调整模块导入路径,简化引用结构
- 更新 openspec/config.yaml 中 git 任务相关说明
- 将 scripts.core.* 改为 core.*,scripts.readers.* 改为 readers.*
- 优化 lyxy_document_reader.py 中 sys.path 设置方式
- 同步更新所有测试文件的导入路径
2026-03-09 15:44:51 +08:00

57 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""PDF 文件阅读器,支持多种解析方法(OCR 优先)。"""
import os
from typing import List, Optional, Tuple
from readers.base import BaseReader
from utils import is_valid_pdf
from . import docling_ocr
from . import unstructured_ocr
from . import docling
from . import unstructured
from . import markitdown
from . import pypdf
# Ordered fallback chain of (display name, parse callable) pairs.
# PdfReader.parse walks this list from top to bottom and stops at the first
# parser that yields content, so the OCR-based backends are listed first.
PARSERS = [
    (label, backend.parse)
    for label, backend in (
        ("docling OCR", docling_ocr),
        ("unstructured OCR", unstructured_ocr),
        ("docling", docling),
        ("unstructured", unstructured),
        ("MarkItDown", markitdown),
        ("pypdf", pypdf),
    )
]
class PdfReader(BaseReader):
    """Reader for PDF files that tries every entry of ``PARSERS`` in order."""

    def supports(self, file_path: str) -> bool:
        """Return True when ``file_path`` ends with ``.pdf`` (case-insensitive)."""
        lowered = file_path.lower()
        return lowered.endswith('.pdf')

    def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]:
        """Extract the content of a PDF using the first parser that succeeds.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A ``(content, failures)`` tuple. ``content`` is the value returned
            by the first parser whose content is not ``None``; it is ``None``
            when the path does not exist, the ``is_valid_pdf`` check fails, or
            every parser fails. ``failures`` holds one message per parser that
            was tried before that point (or the single reason for an early
            rejection).
        """
        # Guard clauses: reject paths that are missing or fail validation
        # before any parser is invoked.
        if not os.path.exists(file_path):
            return None, ["文件不存在"]
        if not is_valid_pdf(file_path):
            return None, ["不是有效的 PDF 文件"]

        errors: List[str] = []
        for label, run_parser in PARSERS:
            try:
                text, reason = run_parser(file_path)
                if text is not None:
                    # First success wins; failures collected so far are
                    # still handed back to the caller.
                    return text, errors
                errors.append(f"- {label}: {reason}")
            except Exception as exc:
                # A crashing parser must not abort the chain: record the
                # exception and fall through to the next parser.
                errors.append(f"- {label}: [意外异常] {type(exc).__name__}: {exc!s}")
        return None, errors