- 新增 scripts/core/advice_generator.py 建议生成器模块
- 在 config.py 中添加 DEPENDENCIES 依赖配置
- 在 lyxy_document_reader.py 中添加 -a/--advice 参数
- 复用 Reader 实例的 supports 方法检测文件类型
- 支持平台检测,对 macOS x86_64 PDF 返回特殊命令
- 添加单元测试和集成测试
- 更新 SKILL.md,引导优先使用 --advice 参数
- 更新 README.md,添加项目结构说明
- 添加 openspec/specs/cli-advice/spec.md 规范文档
125 lines
3.5 KiB
Python
125 lines
3.5 KiB
Python
#!/usr/bin/env python3
"""Command-line entry point for the document parser.

Supports DOCX, PPTX, XLSX, PDF and HTML files as well as URLs.
"""

import argparse
import logging
import os
import sys
import warnings
from pathlib import Path

# Add the directory that contains this script to sys.path so the script can be
# executed from any working directory.
scripts_dir = Path(__file__).resolve().parent
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

# Suppress third-party progress bars, telemetry and warnings so that only the
# parse result is written to the output. These are set before any later
# import in this module runs.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TQDM_DISABLE"] = "1"
warnings.filterwarnings("ignore")

# Configure the logging system to emit ERROR-level records only.
logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

# Cap the log level of the third-party libraries as well.
logging.getLogger('docling').setLevel(logging.ERROR)
logging.getLogger('unstructured').setLevel(logging.ERROR)

# Project-local modules. Kept below the sys.path / environment / logging
# setup so that setup has already run when these modules are imported.
from core import (
    FileDetectionError,
    ReaderNotFoundError,
    output_result,
    parse_input,
    process_content,
    generate_advice,
)
from readers import READERS


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the document reader CLI.

    Defines the positional ``input_path``, the ``-a/--advice`` flag, the
    ``-n/--context`` integer option (default 2) and a mutually exclusive group
    made of ``-c/--count``, ``-l/--lines``, ``-t/--titles``,
    ``-tc/--title-content`` and ``-s/--search``.
    """
    parser = argparse.ArgumentParser(
        description="将 DOCX、PPTX、XLSX、PDF、HTML 文件或 URL 解析为 Markdown"
    )

    parser.add_argument("input_path", help="DOCX、PPTX、XLSX、PDF、HTML 文件或 URL")

    parser.add_argument(
        "-a",
        "--advice",
        action="store_true",
        help="仅显示执行建议,不实际解析文件",
    )

    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )

    # At most one of the following output modes may be given per invocation.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行(1-6级)",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )

    return parser


def main() -> None:
    """Run the document reader command-line interface.

    Parses the command line, instantiates every class listed in ``READERS``
    and then either prints the result of ``generate_advice`` (``--advice``
    mode) or parses the input with ``parse_input``, post-processes it with
    ``process_content`` and passes it to ``output_result``.

    Exits with status 1 when no advice is produced, when ``parse_input``
    raises ``FileDetectionError`` or ``ReaderNotFoundError``, or when it
    returns ``None`` as content. Invalid arguments make argparse exit with
    status 2. All messages are written with ``print`` (standard output).
    """
    args = _build_parser().parse_args()

    # Instantiate all readers.
    readers = [ReaderCls() for ReaderCls in READERS]

    # --advice mode: only show advice, do not parse the input.
    if args.advice:
        advice = generate_advice(args.input_path, readers, "scripts/lyxy_document_reader.py")
        if not advice:
            print(f"错误: 无法识别文件类型: {args.input_path}")
            sys.exit(1)
        print(advice)
        return

    try:
        content, failures = parse_input(args.input_path, readers)
    except (FileDetectionError, ReaderNotFoundError) as e:
        # Both error types are reported the same way.
        print(f"错误: {e}")
        sys.exit(1)

    if content is None:
        # Every parsing attempt failed: list each failure, then exit.
        print("所有解析方法均失败:")
        for failure in failures:
            print(failure)
        sys.exit(1)

    # Post-process the parsed content.
    content = process_content(content)

    # Emit the result according to the selected options.
    output_result(content, args)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()