Files
lyxy-document/scripts/lyxy_document_reader.py
lanyuanxiaoyao a5c0b67360 refactor: 简化代码,消除重复逻辑
- 删除 tests/test_readers/conftest.py 中重复的 temp_html fixture
- 为 generate_uv_command/generate_python_command 添加 include_pyarmor 参数
- 新增 generate_uv_args 函数用于生成 subprocess 可用的参数列表
- lyxy_document_reader.py 复用 generate_uv_args 函数
2026-03-15 10:28:04 +08:00

118 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""文档解析器入口 - 环境检测和自启动"""
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
# Put the directory containing this file (scripts/) on sys.path so that the
# sibling modules imported later can be found by bare name.
scripts_dir = Path(__file__).resolve().parent
if not any(entry == str(scripts_dir) for entry in sys.path):
    sys.path.append(str(scripts_dir))

# Suppress third-party library output: these switches are set before any such
# library is imported (Hugging Face Hub progress bars / telemetry, tqdm).
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "TQDM_DISABLE": "1",
    }
)
def _build_arg_parser():
    """Build the lightweight CLI parser used by the launcher.

    Only the options needed to validate the command line are declared here.
    ``-c``, ``-l``, ``-t``, ``-tc`` and ``-s`` are mutually exclusive.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(
        description="将 DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL 解析为 Markdown"
    )
    parser.add_argument("input_path", help="DOCX、XLS、XLSX、PPTX、PDF、HTML 文件或 URL")
    parser.add_argument(
        "-n",
        "--context",
        type=int,
        default=2,
        help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
    )

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
    )
    group.add_argument(
        "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
    )
    group.add_argument(
        "-t",
        "--titles",
        action="store_true",
        help="返回解析后的 markdown 文档的标题行1-6级",
    )
    group.add_argument(
        "-tc",
        "--title-content",
        help="指定标题名称,输出该标题及其下级内容(不包含#号)",
    )
    group.add_argument(
        "-s",
        "--search",
        help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
    )
    return parser


def _run_in_current_interpreter(args):
    """Fallback: hand the parsed arguments to ``bootstrap.run_normal`` in-process."""
    # Imported lazily so that --help / argument errors never need bootstrap.
    import bootstrap

    bootstrap.run_normal(args)


def main():
    """Entry point: detect the environment and decide how to run the parser.

    Flow:
      1. Parse the command line (argparse exits on ``--help`` or bad usage).
      2. If ``uv`` is not on PATH, run ``bootstrap.run_normal(args)`` directly.
      3. Otherwise detect the input type; if no reader matches, fall back to
         ``bootstrap.run_normal(args)`` as well so that it reports the error.
      4. Otherwise build a ``uv`` command for ``bootstrap.py`` with the
         dependencies of the detected reader, append the original CLI
         arguments, run it as a child process and exit with its return code.

    Raises:
        SystemExit: from argparse on ``--help``/usage errors, and always at
            the end of the ``uv`` path (carrying the child's return code).
    """
    args = _build_arg_parser().parse_args()

    # Without uv there is nothing to relaunch with: run in this interpreter.
    if not shutil.which("uv"):
        _run_in_current_interpreter(args)
        return

    # uv is available: the dependency-detection helpers are only needed now.
    from core.advice_generator import (
        detect_file_type_light,
        get_platform,
        get_dependencies,
        generate_uv_args,
    )
    from readers import READERS

    # Detect which reader handles the input.
    readers = [reader_type() for reader_type in READERS]
    reader_cls = detect_file_type_light(args.input_path, readers)
    if not reader_cls:
        # Unknown input type: fall back and let the normal path report it.
        _run_in_current_interpreter(args)
        return

    # Resolve the platform and the dependency set for the detected reader.
    platform_id = get_platform()
    python_version, dependencies = get_dependencies(reader_cls, platform_id)

    # Locate bootstrap.py and the project root from this file rather than from
    # the caller's working directory, so the relaunch also works when the
    # script is invoked from another directory. When the working directory is
    # the project root these resolve to the same locations as the previous
    # "scripts/bootstrap.py" / "." values.
    scripts_root = Path(__file__).resolve().parent
    project_root = scripts_root.parent

    # Build the uv argument list and forward every original CLI argument.
    uv_args = generate_uv_args(
        dependencies=dependencies,
        script_path=str(scripts_root / "bootstrap.py"),
        python_version=python_version,
        include_pyarmor=True,
    )
    uv_args.extend(sys.argv[1:])

    env = os.environ.copy()
    env["PYTHONPATH"] = str(project_root)

    # Relaunch with subprocess instead of execvpe (Windows compatible) and
    # propagate the child's exit status.
    result = subprocess.run(uv_args, env=env)
    sys.exit(result.returncode)
# Run the launcher only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()