1
0

增加lyxy-reader-office skill

This commit is contained in:
2026-02-17 22:50:06 +08:00
parent 9f686270c2
commit 9f04dac50b
25 changed files with 609 additions and 1282 deletions

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""文档解析器命令行交互模块,提供命令行接口。支持 DOCX、PPTX、XLSX 和 PDF 文件。"""
import argparse
import logging
import os
import sys
import warnings
# 抑制第三方库的进度条和日志,仅保留解析结果输出
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TQDM_DISABLE"] = "1"
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
import common
import docx_parser
import pdf_parser
import pptx_parser
import xlsx_parser
def main() -> None:
parser = argparse.ArgumentParser(
description="将 DOCX、PPTX、XLSX 或 PDF 文件解析为 Markdown"
)
parser.add_argument("file_path", help="DOCX、PPTX、XLSX 或 PDF 文件的绝对路径")
parser.add_argument(
"-n",
"--context",
type=int,
default=2,
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
)
parser.add_argument(
"--high-res",
action="store_true",
help="PDF 解析时启用 OCR 版面分析(需要额外依赖,处理较慢)",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
)
group.add_argument(
"-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
)
group.add_argument(
"-t",
"--titles",
action="store_true",
help="返回解析后的 markdown 文档的标题行1-6级",
)
group.add_argument(
"-tc",
"--title-content",
help="指定标题名称,输出该标题及其下级内容(不包含#号)",
)
group.add_argument(
"-s",
"--search",
help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
)
args = parser.parse_args()
if not os.path.exists(args.file_path):
print(f"错误: 文件不存在: {args.file_path}")
sys.exit(1)
file_type = common.detect_file_type(args.file_path)
if not file_type:
print(f"错误: 不是有效的 DOCX、PPTX、XLSX 或 PDF 格式: {args.file_path}")
sys.exit(1)
if file_type == "docx":
parsers = [
("docling", docx_parser.parse_docx_with_docling),
("unstructured", docx_parser.parse_docx_with_unstructured),
("pypandoc-binary", docx_parser.parse_docx_with_pypandoc),
("MarkItDown", docx_parser.parse_docx_with_markitdown),
("python-docx", docx_parser.parse_docx_with_python_docx),
("XML 原生解析", docx_parser.parse_docx_with_xml),
]
elif file_type == "pptx":
parsers = [
("docling", pptx_parser.parse_pptx_with_docling),
("unstructured", pptx_parser.parse_pptx_with_unstructured),
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
]
elif file_type == "xlsx":
parsers = [
("docling", xlsx_parser.parse_xlsx_with_docling),
("unstructured", xlsx_parser.parse_xlsx_with_unstructured),
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
("pandas", xlsx_parser.parse_xlsx_with_pandas),
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
]
else:
if args.high_res:
parsers = [
("docling OCR", pdf_parser.parse_pdf_with_docling_ocr),
("unstructured OCR", pdf_parser.parse_pdf_with_unstructured_ocr),
("docling", pdf_parser.parse_pdf_with_docling),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
else:
parsers = [
("docling", pdf_parser.parse_pdf_with_docling),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
failures = []
content = None
for parser_name, parser_func in parsers:
content, error = parser_func(args.file_path)
if content is not None:
content = common.remove_markdown_images(content)
content = common.normalize_markdown_whitespace(content)
break
else:
failures.append(f"- {parser_name}: {error}")
if content is None:
print("所有解析方法均失败:")
for failure in failures:
print(failure)
sys.exit(1)
if args.count:
print(len(content.replace("\n", "")))
elif args.lines:
print(len(content.split("\n")))
elif args.titles:
titles = common.extract_titles(content)
for title in titles:
print(title)
elif args.title_content:
title_content = common.extract_title_content(content, args.title_content)
if title_content is None:
print(f"错误: 未找到标题 '{args.title_content}'")
sys.exit(1)
print(title_content, end="")
elif args.search:
search_result = common.search_markdown(content, args.search, args.context)
if search_result is None:
print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
sys.exit(1)
print(search_result, end="")
else:
print(content, end="")
if __name__ == "__main__":
main()