增加 --high-res 参数,用于控制是否使用 OCR 模式解析 PDF
This commit is contained in:
@@ -2,8 +2,17 @@
|
||||
"""文档解析器命令行交互模块,提供命令行接口。支持 DOCX、PPTX、XLSX 和 PDF 文件。"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
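# Suppress third-party progress bars, all Python warnings, and log records at WARNING level and below, to keep the output focused on the parse result.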
|
||||
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
||||
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
||||
os.environ["TQDM_DISABLE"] = "1"
|
||||
warnings.filterwarnings("ignore")
|
||||
logging.disable(logging.WARNING)
|
||||
|
||||
import common
|
||||
import docx_parser
|
||||
@@ -27,6 +36,12 @@ def main() -> None:
|
||||
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--high-res",
|
||||
action="store_true",
|
||||
help="PDF 解析时启用 OCR 版面分析(需要额外依赖,处理较慢)",
|
||||
)
|
||||
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
|
||||
@@ -88,12 +103,22 @@ def main() -> None:
|
||||
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
|
||||
]
|
||||
else:
|
||||
parsers = [
|
||||
("docling", pdf_parser.parse_pdf_with_docling),
|
||||
("unstructured", pdf_parser.parse_pdf_with_unstructured),
|
||||
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("pypdf", pdf_parser.parse_pdf_with_pypdf),
|
||||
]
|
||||
if args.high_res:
|
||||
parsers = [
|
||||
("docling OCR", pdf_parser.parse_pdf_with_docling_ocr),
|
||||
("unstructured OCR", pdf_parser.parse_pdf_with_unstructured_ocr),
|
||||
("docling", pdf_parser.parse_pdf_with_docling),
|
||||
("unstructured", pdf_parser.parse_pdf_with_unstructured),
|
||||
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("pypdf", pdf_parser.parse_pdf_with_pypdf),
|
||||
]
|
||||
else:
|
||||
parsers = [
|
||||
("docling", pdf_parser.parse_pdf_with_docling),
|
||||
("unstructured", pdf_parser.parse_pdf_with_unstructured),
|
||||
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("pypdf", pdf_parser.parse_pdf_with_pypdf),
|
||||
]
|
||||
|
||||
failures = []
|
||||
content = None
|
||||
|
||||
Reference in New Issue
Block a user