1
0

增加参数控制是否使用ocr模式解析pdf

This commit is contained in:
2026-02-17 21:58:27 +08:00
parent c693e23888
commit a21f5063c8
3 changed files with 141 additions and 48 deletions

View File

@@ -2,8 +2,17 @@
"""文档解析器命令行交互模块,提供命令行接口。支持 DOCX、PPTX、XLSX 和 PDF 文件。"""
import argparse
import logging
import os
import sys
import warnings
# Silence progress bars and log noise from third-party libraries so that only
# the parse result is written to the output.
# NOTE(review): this sits ahead of the project imports that follow, presumably
# so the settings are in place before those modules load their dependencies.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
# Ignore every warning raised through the `warnings` module.
warnings.filterwarnings("ignore")
# Drop all logging calls of severity WARNING and below, process-wide.
logging.disable(logging.WARNING)
import common
import docx_parser
@@ -27,6 +36,12 @@ def main() -> None:
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
)
parser.add_argument(
"--high-res",
action="store_true",
help="PDF 解析时启用 OCR 版面分析(需要额外依赖,处理较慢)",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
@@ -88,12 +103,22 @@ def main() -> None:
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
]
else:
parsers = [
("docling", pdf_parser.parse_pdf_with_docling),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
if args.high_res:
parsers = [
("docling OCR", pdf_parser.parse_pdf_with_docling_ocr),
("unstructured OCR", pdf_parser.parse_pdf_with_unstructured_ocr),
("docling", pdf_parser.parse_pdf_with_docling),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
else:
parsers = [
("docling", pdf_parser.parse_pdf_with_docling),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
failures = []
content = None