Files
lyxy-document/scripts/lyxy_document_reader.py
lanyuanxiaoyao edbdeec90d fix: 支持从任意路径调用 lyxy_document_reader.py
- 从 __file__ 动态计算项目根目录
- 使用绝对路径引用 bootstrap.py
- 设置正确的 PYTHONPATH 和 cwd
- 添加路径解析测试

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-15 12:06:44 +08:00

123 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""文档解析器入口 - 环境检测和自启动"""
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
# Derive the project layout from this file's own resolved location so the
# script behaves the same no matter which directory it is invoked from.
script_file = Path(__file__).resolve()
scripts_dir = script_file.parent
project_root = scripts_dir.parent
bootstrap_path = str(scripts_dir.joinpath("bootstrap.py"))

# Make the modules that live next to this script (scripts/) importable.
if not any(entry == str(scripts_dir) for entry in sys.path):
    sys.path.append(str(scripts_dir))

# Silence progress bars / telemetry from third-party libraries.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
def _build_parser():
    """Build the lightweight argument parser used by the launcher."""
    parser = argparse.ArgumentParser(
        description="将 DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL 解析为 Markdown"
    )
    parser.add_argument("input_path", help="DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )
    # The output modes below cannot be combined with each other.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行1-6级",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    return parser


def _run_in_process(args):
    """Run bootstrap in the current interpreter instead of relaunching via uv."""
    import bootstrap

    bootstrap.run_normal(args)


def _child_cli_args(args):
    """Return the command-line arguments to forward to the uv child process.

    The child is started with ``cwd=project_root``, so a relative input path
    would be resolved against the project root rather than the directory the
    user invoked the script from. When the input is a relative path to an
    existing local file, the command line is rebuilt from the parsed
    namespace with that path made absolute. In every other case (URL,
    absolute path, non-existent path) the original arguments are forwarded
    verbatim.
    """
    input_path = args.input_path
    if os.path.isabs(input_path) or not os.path.exists(input_path):
        return sys.argv[1:]

    forwarded = [os.path.abspath(input_path), "--context=" + str(args.context)]
    # At most one of these is set: the parser declares them mutually exclusive.
    # The ``--opt=value`` form keeps values that start with "-" intact.
    if args.count:
        forwarded.append("--count")
    elif args.lines:
        forwarded.append("--lines")
    elif args.titles:
        forwarded.append("--titles")
    elif args.title_content is not None:
        forwarded.append("--title-content=" + args.title_content)
    elif args.search is not None:
        forwarded.append("--search=" + args.search)
    return forwarded


def main():
    """Entry point: detect the environment and decide how to run the parser.

    Flow:
      1. Parse the command line (argparse exits on invalid arguments).
      2. If ``uv`` is not on PATH, run ``bootstrap.run_normal(args)`` in this
         process and return.
      3. Otherwise detect the reader for the input; if none matches, fall
         back to the in-process run so that it reports the error.
      4. Build the ``uv`` command for ``bootstrap.py`` and run it as a child
         process from the project root.

    Raises:
        SystemExit: in the uv path, always, carrying the child's return code.
    """
    args = _build_parser().parse_args()

    # Without uv there is nothing to relaunch with: run bootstrap directly.
    if not shutil.which("uv"):
        _run_in_process(args)
        return

    # uv is available. The dependency-detection modules are imported lazily so
    # the fallback path above does not need them.
    # DEPENDENCIES is not referenced in this function; the import is kept so
    # the set of modules loaded here stays unchanged.
    from config import DEPENDENCIES  # noqa: F401
    from core.advice_generator import (
        detect_file_type_light,
        get_platform,
        get_dependencies,
        generate_uv_args,
    )
    from readers import READERS

    # Work out which reader handles this input.
    readers = [ReaderCls() for ReaderCls in READERS]
    reader_cls = detect_file_type_light(args.input_path, readers)
    if not reader_cls:
        # Unknown file type: run in-process and let bootstrap report it.
        _run_in_process(args)
        return

    # Platform-specific Python version and dependency list for that reader.
    platform_id = get_platform()
    python_version, dependencies = get_dependencies(reader_cls, platform_id)

    # uv command line that launches bootstrap.py.
    uv_args = generate_uv_args(
        dependencies=dependencies,
        script_path=bootstrap_path,
        python_version=python_version,
        include_pyarmor=True,
    )
    # Forward the user's arguments, with a relative local input path made
    # absolute because the child's working directory differs from the caller's.
    uv_args.extend(_child_cli_args(args))

    # Child environment: a copy of ours with PYTHONPATH set to the project
    # root (this replaces any inherited PYTHONPATH value).
    env = os.environ.copy()
    env["PYTHONPATH"] = str(project_root)

    # Relaunch with subprocess rather than execvpe for Windows compatibility,
    # and propagate the child's exit status.
    result = subprocess.run(uv_args, env=env, cwd=str(project_root))
    sys.exit(result.returncode)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()