- 删除 tests/test_readers/conftest.py 中重复的 temp_html fixture - 为 generate_uv_command/generate_python_command 添加 include_pyarmor 参数 - 新增 generate_uv_args 函数用于生成 subprocess 可用的参数列表 - lyxy_document_reader.py 复用 generate_uv_args 函数
297 lines
7.5 KiB
Python
297 lines
7.5 KiB
Python
"""建议生成器模块,根据文件类型和平台返回执行建议。"""
|
||
|
||
import platform
|
||
from pathlib import Path
|
||
from typing import Dict, Optional, Tuple, List, Type
|
||
|
||
from config import DEPENDENCIES
|
||
from readers import BaseReader
|
||
from readers import (
|
||
PdfReader,
|
||
DocxReader,
|
||
XlsxReader,
|
||
PptxReader,
|
||
HtmlReader,
|
||
XlsReader,
|
||
)
|
||
|
||
|
||
# Maps each Reader class to the key used for it in the DEPENDENCIES config.
_READER_KEY_MAP: Dict[Type[BaseReader], str] = dict(
    (
        (PdfReader, "pdf"),
        (DocxReader, "docx"),
        (XlsxReader, "xlsx"),
        (PptxReader, "pptx"),
        (HtmlReader, "html"),
        (XlsReader, "xls"),
    )
)
|
||
|
||
|
||
def detect_file_type_light(input_path: str, readers: List[BaseReader]) -> Optional[Type[BaseReader]]:
    """Detect the file type cheaply by reusing each reader's ``supports`` method.

    Args:
        input_path: File path or URL to probe.
        readers: Already-instantiated readers, consulted in list order.

    Returns:
        The class of the first reader whose ``supports(input_path)`` is
        truthy, or ``None`` when no reader accepts the input.
    """
    # Lazy generator: stops calling ``supports`` as soon as one reader matches.
    accepting_classes = (
        candidate.__class__ for candidate in readers if candidate.supports(input_path)
    )
    return next(accepting_classes, None)
|
||
|
||
|
||
def get_platform() -> str:
    """Return the current platform identifier in ``{system}-{machine}`` form.

    Returns:
        The values of ``platform.system()`` and ``platform.machine()`` joined
        by a hyphen, e.g. "Darwin-arm64", "Linux-x86_64" or "Windows-AMD64".
    """
    return "-".join((platform.system(), platform.machine()))
|
||
|
||
|
||
def get_dependencies(reader_cls: Type[BaseReader], platform_id: str) -> Tuple[Optional[str], list]:
    """Look up the dependency configuration for a Reader class on a platform.

    Args:
        reader_cls: Reader class to look up.
        platform_id: Platform identifier (e.g. "Darwin-arm64").

    Returns:
        A ``(python_version, dependencies)`` tuple:
        - python_version: required Python version; ``None`` means use the default.
        - dependencies: list of dependency packages.
        ``(None, [])`` is returned when the reader class has no known key, the
        key is absent from ``DEPENDENCIES``, or neither a platform-specific nor
        a "default" entry exists.
    """
    config_key = _READER_KEY_MAP.get(reader_cls)
    if not (config_key and config_key in DEPENDENCIES):
        return None, []

    per_platform = DEPENDENCIES[config_key]

    # A platform-specific entry takes precedence; "default" is the fallback.
    for candidate in (platform_id, "default"):
        if candidate in per_platform:
            entry = per_platform[candidate]
            return entry.get("python"), entry.get("dependencies", [])

    return None, []
|
||
|
||
|
||
def generate_uv_command(
    dependencies: list,
    input_path: str,
    python_version: Optional[str] = None,
    script_path: str = "scripts/lyxy_document_reader.py",
    include_pyarmor: bool = True
) -> str:
    """Build a copy-pasteable ``uv run`` shell command.

    Dependencies, the script path and the input path are wrapped in double
    quotes whenever they contain whitespace or a character the shell would
    interpret (e.g. ``[`` in ``unstructured[pdf]``, ``>`` in ``pkg>=1.0``,
    ``?``/``&`` in a URL). Values without such characters are emitted as-is.
    Embedded double quotes, ``$`` and backticks are not escaped.

    Args:
        dependencies: List of dependency packages, each passed via ``--with``.
        input_path: Input file path or URL.
        python_version: Required Python version; falsy means ``--python`` is omitted.
        script_path: Path of the script to run.
        include_pyarmor: Whether to add ``--with pyarmor`` before the other dependencies.

    Returns:
        The ``uv run`` command as a single string.
    """
    def quote(token: str) -> str:
        # Quote only when needed so ordinary values keep their plain form.
        if any(ch.isspace() or ch in "[]*?&;|<>()#" for ch in token):
            return f'"{token}"'
        return token

    parts = ["PYTHONPATH=. uv run"]

    if python_version:
        parts.append(f"--python {python_version}")

    if include_pyarmor:
        parts.append("--with pyarmor")

    parts.extend(f"--with {quote(dep)}" for dep in dependencies)

    # Script first, then the input it should read.
    parts.append(quote(script_path))
    parts.append(quote(input_path))

    return " ".join(parts)
|
||
|
||
|
||
def generate_uv_args(
    dependencies: list,
    script_path: str,
    python_version: Optional[str] = None,
    include_pyarmor: bool = True
) -> list:
    """Build the ``uv run`` argument list (for use with ``subprocess.run``).

    Args:
        dependencies: List of dependency packages, each passed via ``--with``.
        script_path: Path of the script to run; appended as the last argument.
        python_version: Required Python version; falsy means ``--python`` is omitted.
        include_pyarmor: Whether to add ``--with pyarmor`` before the other dependencies.

    Returns:
        The ``uv run`` argument list. The input path is not included.
    """
    command = ["uv", "run"]

    if python_version:
        command += ["--python", python_version]

    # pyarmor (when requested) always precedes the caller's dependencies.
    packages = (["pyarmor"] if include_pyarmor else []) + list(dependencies)
    for package in packages:
        command += ["--with", package]

    command.append(script_path)
    return command
|
||
|
||
|
||
def generate_python_command(
    dependencies: list,
    input_path: str,
    script_path: str = "scripts/lyxy_document_reader.py",
    include_pyarmor: bool = True
) -> Tuple[str, str]:
    """Build the plain ``python`` command and the matching ``pip install`` command.

    Packages, the script path and the input path are wrapped in double quotes
    whenever they contain whitespace or a character the shell would interpret
    (e.g. ``[`` in ``unstructured[pdf]``, ``>`` in ``pkg>=1.0``, ``?``/``&`` in
    a URL). Values without such characters are emitted as-is. Embedded double
    quotes, ``$`` and backticks are not escaped.

    Args:
        dependencies: List of dependency packages to install.
        input_path: Input file path or URL.
        script_path: Path of the script to run.
        include_pyarmor: Whether to list ``pyarmor`` before the other dependencies.

    Returns:
        A ``(python_command, pip_command)`` tuple of command strings.
    """
    def quote(token: str) -> str:
        # Quote only when needed so ordinary values keep their plain form.
        if any(ch.isspace() or ch in "[]*?&;|<>()#" for ch in token):
            return f'"{token}"'
        return token

    python_cmd = f"python {quote(script_path)} {quote(input_path)}"

    # pyarmor (when requested) always precedes the caller's dependencies.
    packages = (["pyarmor"] if include_pyarmor else []) + list(dependencies)
    pip_cmd = " ".join(["pip install", *(quote(package) for package in packages)])

    return python_cmd, pip_cmd
|
||
|
||
|
||
def format_advice(
    file_type: str,
    input_path: str,
    platform_id: str,
    uv_command: str,
    python_command: str,
    pip_command: str,
    has_platform_specific: bool = False
) -> str:
    """Render the advice text shown to the user.

    Args:
        file_type: File type; printed upper-cased.
        input_path: Input path.
        platform_id: Platform identifier.
        uv_command: The uv command.
        python_command: The python command.
        pip_command: The pip install command.
        has_platform_specific: Whether a platform-specific configuration was
            used; the platform line is printed only when this is true.

    Returns:
        The formatted advice as newline-joined text with no trailing newline.
    """
    # Header: file type and input path, plus the platform when it mattered.
    header = [
        f"文件类型: {file_type.upper()}",
        f"输入路径: {input_path}",
    ]
    if has_platform_specific:
        header.append(f"平台: {platform_id}")

    # Each command section is preceded by a blank separator line.
    sections = [
        "",
        "[uv 命令]",
        uv_command,
        "",
        "[python 命令]",
        python_command,
        pip_command,
    ]

    return "\n".join(header + sections)
|
||
|
||
|
||
def generate_advice(
    input_path: str,
    readers: List[BaseReader],
    script_path: str = "scripts/lyxy_document_reader.py"
) -> Optional[str]:
    """Produce the complete execution advice for an input.

    Args:
        input_path: Input file path or URL.
        readers: Already-instantiated readers used for type detection.
        script_path: Path of the script the generated commands should run.

    Returns:
        The formatted advice text, or ``None`` when no reader recognises the
        input.
    """
    # Detect the file type; bail out early when nothing supports the input.
    reader_cls = detect_file_type_light(input_path, readers)
    if not reader_cls:
        return None

    # The config key doubles as the displayed file type.
    config_key = _READER_KEY_MAP.get(reader_cls, "unknown")

    current_platform = get_platform()
    python_version, dependencies = get_dependencies(reader_cls, current_platform)

    # Flagged only when a platform entry exists alongside a "default" entry.
    used_platform_override = (
        config_key in DEPENDENCIES
        and current_platform in DEPENDENCIES[config_key]
        and "default" in DEPENDENCIES[config_key]
    )

    uv_command = generate_uv_command(dependencies, input_path, python_version, script_path)
    python_command, pip_command = generate_python_command(dependencies, input_path, script_path)

    return format_advice(
        config_key,
        input_path,
        current_platform,
        uv_command,
        python_command,
        pip_command,
        used_platform_override,
    )
|