- 从 __file__ 动态计算项目根目录 - 使用绝对路径引用 bootstrap.py - 设置正确的 PYTHONPATH 和 cwd - 添加路径解析测试 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
123 lines
3.5 KiB
Python
123 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
||
"""文档解析器入口 - 环境检测和自启动"""
|
||
|
||
import argparse
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Absolute locations derived from this file's own path:
#   script_file    - this script, fully resolved
#   scripts_dir    - the directory that contains this script
#   project_root   - the parent of scripts_dir
#   bootstrap_path - bootstrap.py next to this script, as a plain string
script_file = Path(__file__).resolve()
scripts_dir = script_file.parent
project_root = scripts_dir.parent
bootstrap_path = str(scripts_dir.joinpath("bootstrap.py"))

# Make the modules that live next to this script importable (added once).
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

# Quieten third-party libraries: no Hugging Face Hub progress bars or
# telemetry, and no tqdm output.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
|
||
|
||
|
||
def _build_parser():
    """Build the lightweight command-line parser used by ``main``.

    Returns:
        argparse.ArgumentParser: parser with one positional ``input_path``,
        the ``-n/--context`` integer option (default 2) and a mutually
        exclusive group of ``-c``, ``-l``, ``-t``, ``-tc`` and ``-s``.
    """
    parser = argparse.ArgumentParser(
        description="将 DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL 解析为 Markdown"
    )
    parser.add_argument("input_path", help="DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行(1-6级)",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    return parser


def _forwarded_cli_args(args, argv):
    """Return the CLI arguments to hand to the bootstrap subprocess.

    The subprocess is started with a different working directory than the
    caller's, so a relative ``input_path`` that exists in the caller's
    directory would no longer point at the same file. In that case the
    argument list is rebuilt from the parsed namespace with the path made
    absolute. In every other case (URL, absolute path, or a path that does
    not exist relative to the caller's directory) ``argv`` is returned
    unchanged.

    Args:
        args: namespace produced by the parser from ``_build_parser``.
        argv: the raw argument list (without the program name).

    Returns:
        list[str]: arguments to append to the uv command line.
    """
    path = args.input_path
    if os.path.isabs(path) or not os.path.exists(path):
        return list(argv)

    forwarded = [os.path.abspath(path), "--context", str(args.context)]
    if args.count:
        forwarded.append("--count")
    if args.lines:
        forwarded.append("--lines")
    if args.titles:
        forwarded.append("--titles")
    # The "--opt=value" form keeps values that start with "-" from being
    # mistaken for options by the receiving parser.
    if args.title_content is not None:
        forwarded.append(f"--title-content={args.title_content}")
    if args.search is not None:
        forwarded.append(f"--search={args.search}")
    return forwarded


def main():
    """Detect the environment and run the parser directly or through uv.

    Flow:
      1. Parse the command line.
      2. If ``uv`` is not on PATH, call ``bootstrap.run_normal(args)``
         in-process and return.
      3. Otherwise detect the reader for the input; if none matches, fall
         back to ``bootstrap.run_normal(args)`` so that it reports the error.
      4. Build the uv command for ``bootstrap.py`` with the dependencies of
         the detected reader, run it as a subprocess with the project root
         as working directory, and exit with the subprocess's return code.
    """
    args = _build_parser().parse_args()

    # Without uv there is nothing to relaunch under: run bootstrap in-process.
    uv_path = shutil.which("uv")
    if not uv_path:
        import bootstrap

        bootstrap.run_normal(args)
        return

    # uv is available: import the dependency-detection helpers lazily.
    # NOTE(review): DEPENDENCIES is not used in this function; the import is
    # retained unchanged in case it is relied on for side effects - confirm.
    from config import DEPENDENCIES  # noqa: F401
    from core.advice_generator import (
        detect_file_type_light,
        get_platform,
        get_dependencies,
        generate_uv_args,
    )
    from readers import READERS

    # Detect which reader handles the input.
    readers = [ReaderCls() for ReaderCls in READERS]
    reader_cls = detect_file_type_light(args.input_path, readers)

    if not reader_cls:
        # Unknown input type: run the normal path and let it report the error.
        import bootstrap

        bootstrap.run_normal(args)
        return

    # Platform-specific Python version and dependency list for this reader.
    platform_id = get_platform()
    python_version, dependencies = get_dependencies(reader_cls, platform_id)

    # Base uv command line targeting bootstrap.py.
    uv_args = generate_uv_args(
        dependencies=dependencies,
        script_path=bootstrap_path,
        python_version=python_version,
        include_pyarmor=True,
    )

    # Forward the user's arguments. A relative input path is made absolute
    # because the subprocess below runs with cwd=project_root, not the
    # caller's working directory.
    uv_args.extend(_forwarded_cli_args(args, sys.argv[1:]))

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project_root)

    # Relaunch via subprocess rather than execvpe (Windows compatible) and
    # propagate the child's exit status.
    result = subprocess.run(uv_args, env=env, cwd=str(project_root))
    sys.exit(result.returncode)
|
||
|
||
|
||
# Run the entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|