#!/usr/bin/env python3
"""Command-line front end for the document parser.

Parses a DOCX, PPTX or XLSX file into Markdown and prints either the whole
document or one derived view of it (character count, line count, headings,
a single section, or regex search hits).
"""

import argparse
import os
import sys

import common
import docx
import pptx
import xlsx


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description="将 DOCX、PPTX 或 XLSX 文件解析为 Markdown"
    )
    parser.add_argument("file_path", help="DOCX、PPTX 或 XLSX 文件的绝对路径")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )

    # The output views are mutually exclusive: at most one may be requested.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行(1-6级)",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    return parser


def _select_parsers(file_type: str) -> list:
    """Return the ordered ``(display name, parse function)`` chain for a type.

    "docx" and "pptx" get their own chains; any other value is handled by the
    XLSX chain, mirroring the original three-way branch.
    """
    if file_type == "docx":
        return [
            ("MarkItDown", docx.parse_docx_with_markitdown),
            ("python-docx", docx.parse_docx_with_python_docx),
            ("XML 原生解析", docx.parse_docx_with_xml),
        ]
    if file_type == "pptx":
        return [
            ("MarkItDown", pptx.parse_pptx_with_markitdown),
            ("python-pptx", pptx.parse_pptx_with_python_pptx),
            ("XML 原生解析", pptx.parse_pptx_with_xml),
        ]
    return [
        ("MarkItDown", xlsx.parse_xlsx_with_markitdown),
        ("pandas", xlsx.parse_xlsx_with_pandas),
        ("XML 原生解析", xlsx.parse_xlsx_with_xml),
    ]


def _parse_with_fallback(file_path: str, parsers: list) -> tuple:
    """Try each parser in order until one yields content.

    Every parse function is called with ``file_path`` and its result is
    unpacked as a ``(content, error)`` pair. The first non-``None`` content is
    cleaned with ``common.remove_markdown_images`` and
    ``common.normalize_markdown_whitespace`` and returned immediately.

    Returns:
        ``(content, failures)``. ``content`` is ``None`` when every parser
        failed; ``failures`` holds one formatted line per parser that was
        tried and produced no content.
    """
    failures = []
    for parser_name, parser_func in parsers:
        content, error = parser_func(file_path)
        if content is not None:
            content = common.remove_markdown_images(content)
            content = common.normalize_markdown_whitespace(content)
            return content, failures
        failures.append(f"- {parser_name}: {error}")
    return None, failures


def _emit_output(content: str, args: argparse.Namespace) -> None:
    """Print the view of ``content`` selected by the parsed arguments.

    Exits with status 1 when the requested heading is not found or the search
    helper returns ``None``. With no view option the full document is printed.
    """
    if args.count:
        # Character count, with newlines excluded.
        print(len(content.replace("\n", "")))
    elif args.lines:
        print(len(content.split("\n")))
    elif args.titles:
        for title in common.extract_titles(content):
            print(title)
    elif args.title_content is not None:
        # Compare against None rather than truthiness: an explicitly supplied
        # empty value must not silently fall through to dumping the document.
        title_content = common.extract_title_content(content, args.title_content)
        if title_content is None:
            print(f"错误: 未找到标题 '{args.title_content}'")
            sys.exit(1)
        print(title_content, end="")
    elif args.search is not None:
        # Same reasoning as above for an explicitly supplied empty pattern.
        search_result = common.search_markdown(content, args.search, args.context)
        if search_result is None:
            print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'")
            sys.exit(1)
        print(search_result, end="")
    else:
        print(content, end="")


def main() -> None:
    """Run the CLI: validate the input file, parse it, and print the result.

    Exits with status 1 when the file does not exist, its type is not
    recognised by ``common.detect_file_type``, every parser fails, or the
    requested view cannot be produced.
    """
    args = _build_arg_parser().parse_args()

    if not os.path.exists(args.file_path):
        print(f"错误: 文件不存在: {args.file_path}")
        sys.exit(1)

    file_type = common.detect_file_type(args.file_path)
    if not file_type:
        print(f"错误: 不是有效的 DOCX、PPTX 或 XLSX 格式: {args.file_path}")
        sys.exit(1)

    content, failures = _parse_with_fallback(
        args.file_path, _select_parsers(file_type)
    )
    if content is None:
        print("所有解析方法均失败:")
        for failure in failures:
            print(failure)
        sys.exit(1)

    _emit_output(content, args)


if __name__ == "__main__":
    main()