# Skill/temp/scripts/parser.py

#!/usr/bin/env python3
"""文档解析器命令行交互模块，提供命令行接口。支持 DOCX、PPTX、XLSX 和 PDF 文件。"""

import argparse
import os
import sys

import common
import docx_parser
import pdf_parser
import pptx_parser
import xlsx_parser


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: a positional path, ``-n`` and five exclusive modes."""
    cli = argparse.ArgumentParser(
        description="将 DOCX、PPTX、XLSX 或 PDF 文件解析为 Markdown"
    )

    cli.add_argument("file_path", help="DOCX、PPTX、XLSX 或 PDF 文件的绝对路径")

    cli.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用，指定每个检索结果包含的前后行数（不包含空行）",
    )

    # At most one output mode may be selected per invocation.
    modes = cli.add_mutually_exclusive_group()
    modes.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    modes.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    modes.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行（1-6级）",
    )
    modes.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称，输出该标题及其下级内容（不包含#号）",
    )
    modes.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档，返回所有匹配结果（用---分隔）",
    )
    return cli


def _candidate_parsers(file_type: str) -> list:
    """Return the ordered ``(label, function)`` fallback chain for *file_type*."""
    if file_type == "docx":
        return [
            ("MarkItDown", docx_parser.parse_docx_with_markitdown),
            ("python-docx", docx_parser.parse_docx_with_python_docx),
            ("XML 原生解析", docx_parser.parse_docx_with_xml),
        ]
    if file_type == "pptx":
        return [
            ("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
            ("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
            ("XML 原生解析", pptx_parser.parse_pptx_with_xml),
        ]
    if file_type == "xlsx":
        return [
            ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
            ("pandas", xlsx_parser.parse_xlsx_with_pandas),
            ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
        ]
    # Every other detected type falls through to the PDF chain.
    return [
        ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
        ("unstructured", pdf_parser.parse_pdf_with_unstructured),
        ("pypdf", pdf_parser.parse_pdf_with_pypdf),
    ]


def main() -> None:
    """Parse a document to Markdown and print the requested view of it.

    Reads the command line, validates the input path, detects the file type
    via ``common.detect_file_type`` and tries each candidate parser in order
    until one returns content. The first successful result is passed through
    ``common.remove_markdown_images`` and
    ``common.normalize_markdown_whitespace`` before output.

    Output modes (mutually exclusive):
        -c   number of characters in the content, newlines excluded
        -l   number of ``"\\n"``-separated segments in the content
        -t   one title per line, from ``common.extract_titles``
        -tc  the section returned by ``common.extract_title_content``
        -s   the matches returned by ``common.search_markdown``
        none the full content

    Exits with status 1 when the path is missing or not a regular file, the
    type is not recognised, every parser fails, or the requested title or
    search yields no result.
    """
    args = _build_arg_parser().parse_args()

    if not os.path.exists(args.file_path):
        print(f"错误: 文件不存在: {args.file_path}")
        sys.exit(1)

    # os.path.exists() is also true for directories; reject them here so they
    # never reach type detection or the parsers.
    if not os.path.isfile(args.file_path):
        print(f"错误: 路径不是文件: {args.file_path}")
        sys.exit(1)

    file_type = common.detect_file_type(args.file_path)
    if not file_type:
        print(f"错误: 不是有效的 DOCX、PPTX、XLSX 或 PDF 格式: {args.file_path}")
        sys.exit(1)

    failures = []
    content = None

    # Each parser returns a (content, error) pair; content is None on failure.
    for parser_name, parser_func in _candidate_parsers(file_type):
        content, error = parser_func(args.file_path)
        if content is not None:
            content = common.remove_markdown_images(content)
            content = common.normalize_markdown_whitespace(content)
            break
        failures.append(f"- {parser_name}: {error}")

    if content is None:
        print("所有解析方法均失败:")
        for failure in failures:
            print(failure)
        sys.exit(1)

    if args.count:
        print(len(content.replace("\n", "")))
    elif args.lines:
        print(len(content.split("\n")))
    elif args.titles:
        for title in common.extract_titles(content):
            print(title)
    elif args.title_content:
        title_content = common.extract_title_content(content, args.title_content)
        if title_content is None:
            print(f"错误: 未找到标题 '{args.title_content}'")
            sys.exit(1)
        # end="" keeps the helper's text as-is, without an extra newline.
        print(title_content, end="")
    elif args.search:
        search_result = common.search_markdown(content, args.search, args.context)
        if search_result is None:
            print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
            sys.exit(1)
        print(search_result, end="")
    else:
        print(content, end="")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()