增加lyxy-reader-office skill

2026-02-17 22:50:06 +08:00
parent 9f686270c2
commit 9f04dac50b
25 changed files with 609 additions and 1282 deletions
--- a/skills/lyxy-reader-office/scripts/parser.py
+++ b/skills/lyxy-reader-office/scripts/parser.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""文档解析器命令行交互模块，提供命令行接口。支持 DOCX、PPTX、XLSX 和 PDF 文件。"""
+
+import argparse
+import logging
+import os
+import sys
+import warnings
+
+# Quiet third-party progress bars, telemetry, Python warnings and log records
+# at WARNING level or below, so that only the parsed result is printed.
+# Kept ahead of the parser-module imports below, presumably so the settings
+# are already in effect when those modules load their dependencies.
+os.environ.update(
+    {
+        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
+        "HF_HUB_DISABLE_TELEMETRY": "1",
+        "TQDM_DISABLE": "1",
+    }
+)
+logging.disable(logging.WARNING)
+warnings.filterwarnings("ignore")
+
+import common
+import docx_parser
+import pdf_parser
+import pptx_parser
+import xlsx_parser
+
+
+def _build_parser_chain(file_type: str, high_res: bool) -> list:
+    """Return the ordered ``(label, parse_function)`` fallback chain.
+
+    Args:
+        file_type: Value produced by ``common.detect_file_type``. ``"docx"``,
+            ``"pptx"`` and ``"xlsx"`` select their own chains; any other
+            value is handled as PDF.
+        high_res: When true, the two OCR-based PDF parsers are tried before
+            the regular PDF chain. Ignored for non-PDF file types.
+
+    Returns:
+        A list of ``(label, callable)`` pairs in the order they are tried.
+    """
+    if file_type == "docx":
+        return [
+            ("docling", docx_parser.parse_docx_with_docling),
+            ("unstructured", docx_parser.parse_docx_with_unstructured),
+            ("pypandoc-binary", docx_parser.parse_docx_with_pypandoc),
+            ("MarkItDown", docx_parser.parse_docx_with_markitdown),
+            ("python-docx", docx_parser.parse_docx_with_python_docx),
+            ("XML 原生解析", docx_parser.parse_docx_with_xml),
+        ]
+    if file_type == "pptx":
+        return [
+            ("docling", pptx_parser.parse_pptx_with_docling),
+            ("unstructured", pptx_parser.parse_pptx_with_unstructured),
+            ("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
+            ("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
+            ("XML 原生解析", pptx_parser.parse_pptx_with_xml),
+        ]
+    if file_type == "xlsx":
+        return [
+            ("docling", xlsx_parser.parse_xlsx_with_docling),
+            ("unstructured", xlsx_parser.parse_xlsx_with_unstructured),
+            ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
+            ("pandas", xlsx_parser.parse_xlsx_with_pandas),
+            ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
+        ]
+
+    # Every remaining file type is treated as PDF.
+    pdf_chain = [
+        ("docling", pdf_parser.parse_pdf_with_docling),
+        ("unstructured", pdf_parser.parse_pdf_with_unstructured),
+        ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
+        ("pypdf", pdf_parser.parse_pdf_with_pypdf),
+    ]
+    if high_res:
+        # OCR variants go first; the regular chain remains as the fallback.
+        return [
+            ("docling OCR", pdf_parser.parse_pdf_with_docling_ocr),
+            ("unstructured OCR", pdf_parser.parse_pdf_with_unstructured_ocr),
+        ] + pdf_chain
+    return pdf_chain
+
+
+def main() -> None:
+    """Parse a document from the command line and print the requested view.
+
+    The file is converted to Markdown by the first parser in the fallback
+    chain that returns content. That content is then passed through
+    ``common.remove_markdown_images`` and
+    ``common.normalize_markdown_whitespace``. Depending on which of the
+    mutually exclusive options is given, the function prints the character
+    count (newlines excluded), the line count, the title lines, the section
+    under one title, the regex search results, or, with no option, the whole
+    Markdown text.
+
+    Exits with status 1 when the file does not exist, its type is not
+    recognized, every parser fails, the requested title is not found, or
+    the search returns nothing.
+    """
+    parser = argparse.ArgumentParser(
+        description="将 DOCX、PPTX、XLSX 或 PDF 文件解析为 Markdown"
+    )
+
+    parser.add_argument("file_path", help="DOCX、PPTX、XLSX 或 PDF 文件的绝对路径")
+
+    parser.add_argument(
+        "-n",
+        "--context",
+        type=int,
+        default=2,
+        help="与 -s 配合使用，指定每个检索结果包含的前后行数（不包含空行）",
+    )
+
+    parser.add_argument(
+        "--high-res",
+        action="store_true",
+        help="PDF 解析时启用 OCR 版面分析（需要额外依赖，处理较慢）",
+    )
+
+    # Only one output mode may be selected per invocation.
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
+    )
+    group.add_argument(
+        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
+    )
+    group.add_argument(
+        "-t",
+        "--titles",
+        action="store_true",
+        help="返回解析后的 markdown 文档的标题行（1-6级）",
+    )
+    group.add_argument(
+        "-tc",
+        "--title-content",
+        help="指定标题名称，输出该标题及其下级内容（不包含#号）",
+    )
+    group.add_argument(
+        "-s",
+        "--search",
+        help="使用正则表达式搜索文档，返回所有匹配结果（用---分隔）",
+    )
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.file_path):
+        print(f"错误: 文件不存在: {args.file_path}")
+        sys.exit(1)
+
+    file_type = common.detect_file_type(args.file_path)
+    if not file_type:
+        print(f"错误: 不是有效的 DOCX、PPTX、XLSX 或 PDF 格式: {args.file_path}")
+        sys.exit(1)
+
+    parsers = _build_parser_chain(file_type, args.high_res)
+
+    failures = []
+    content = None
+
+    for parser_name, parser_func in parsers:
+        try:
+            content, error = parser_func(args.file_path)
+        except Exception as exc:
+            # A backend that raises instead of returning (None, error) must
+            # not abort the chain; record it and try the next parser.
+            failures.append(f"- {parser_name}: {type(exc).__name__}: {exc}")
+            continue
+        if content is not None:
+            content = common.remove_markdown_images(content)
+            content = common.normalize_markdown_whitespace(content)
+            break
+        failures.append(f"- {parser_name}: {error}")
+
+    if content is None:
+        print("所有解析方法均失败:")
+        for failure in failures:
+            print(failure)
+        sys.exit(1)
+
+    if args.count:
+        # Character count with newline characters excluded.
+        print(len(content.replace("\n", "")))
+    elif args.lines:
+        print(len(content.split("\n")))
+    elif args.titles:
+        for title in common.extract_titles(content):
+            print(title)
+    elif args.title_content:
+        title_content = common.extract_title_content(content, args.title_content)
+        if title_content is None:
+            print(f"错误: 未找到标题 '{args.title_content}'")
+            sys.exit(1)
+        print(title_content, end="")
+    elif args.search:
+        search_result = common.search_markdown(content, args.search, args.context)
+        if search_result is None:
+            print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
+            sys.exit(1)
+        print(search_result, end="")
+    else:
+        # No mode selected: emit the full Markdown without adding a newline.
+        print(content, end="")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()