diff --git a/service-ai/service-ai-cli/src/main/java/com/lanyuanxiaoyao/service/ai/cli/LlamaSwapTool.java b/service-ai/service-ai-cli/src/main/java/com/lanyuanxiaoyao/service/ai/cli/LlamaSwapTool.java index 4587932..5114242 100644 --- a/service-ai/service-ai-cli/src/main/java/com/lanyuanxiaoyao/service/ai/cli/LlamaSwapTool.java +++ b/service-ai/service-ai-cli/src/main/java/com/lanyuanxiaoyao/service/ai/cli/LlamaSwapTool.java @@ -25,39 +25,32 @@ public class LlamaSwapTool extends GeneratorTool { Map.of( "models", List.of( llamaCppEmbeddingCmd("BGE/bge-m3-q4km", "bge-m3-Q4_K_M.gguf", 20), - vllmEmbeddingCmd("BGE/bge-m3", "bge-m3", 20, 5), + vllmEmbeddingCmd("BGE/bge-m3", "bge-m3", 5), llamaCppRerankerCmd("BGE/beg-reranker-v2-q4km", "bge-reranker-v2-m3-Q4_K_M.gguf", 20), - llamaCppRerankerCmd("BGE/beg-reranker-v2", "bge-reranker-v2-m3", 20), + vllmRerankerCmd("BGE/beg-reranker-v2", "bge-reranker-v2-m3", 5), - vllmCmd("Qwen3/qwen3-0.6b", "Qwen3-0.6B", 35, 5, true), - vllmCmd("Qwen3/qwen3-1.7b", "Qwen3-1.7B", 35, 5, true), - vllmCmd("Qwen3/qwen3-4b", "Qwen3-4B", 35, 8, true), + vllmCmd("Qwen3/qwen3-0.6b", "Qwen3-0.6B", 5, true), + vllmCmd("Qwen3/qwen3-1.7b", "Qwen3-1.7B", 5, true), + vllmCmd("Qwen3/qwen3-4b", "Qwen3-4B", 8, true), llamaCppCmd("Qwen3/qwen3-4b-q4km", "Qwen3-4B-Q4_K_M.gguf", 35), llamaCppCmd("Qwen3/qwen3-8b-q4km", "Qwen3-8B-Q4_K_M.gguf", 35), - vllmEmbeddingCmd("Qwen3/qwen3-embedding-0.6b", "Qwen3-Embedding-0.6B", 35, 5), - vllmEmbeddingCmd("Qwen3/qwen3-embedding-4b", "Qwen3-Embedding-4B", 35, 8), + vllmEmbeddingCmd("Qwen3/qwen3-embedding-0.6b", "Qwen3-Embedding-0.6B", 5), + vllmEmbeddingCmd("Qwen3/qwen3-embedding-4b", "Qwen3-Embedding-4B", 8), llamaCppEmbeddingCmd("Qwen3/qwen3-embedding-4b-q4km", "Qwen3-Embedding-4B-Q4_K_M.gguf", 35), llamaCppEmbeddingCmd("Qwen3/qwen3-embedding-8b-q4km", "Qwen3-Embedding-8B-Q4_K_M.gguf", 35), - vllmRerankerCmd("Qwen3/qwen3-reranker-0.6b", "Qwen3-Reranker-0.6B", 35, 5), - vllmRerankerCmd("Qwen3/qwen3-reranker-4b", "Qwen3-Reranker-4B", 35, 8), + // 0.9.1 vllm还未支持 + // vllmRerankerCmd("Qwen3/qwen3-reranker-0.6b", "Qwen3-Reranker-0.6B", 5), + // vllmRerankerCmd("Qwen3/qwen3-reranker-4b", "Qwen3-Reranker-4B", 8), llamaCppVisualCmd("Qwen2.5/qwen2.5-vl-7b", "Qwen2.5-VL-7B-Instruct-BF16.gguf", 35), llamaCppVisualCmd("Qwen2.5/qwen2.5-vl-7b-q4km", "Qwen2.5-VL-7B-Instruct-Q4_K_M.gguf", 35), - vllmCmd("Qwen2.5/qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 35, 8, false), - vllmCmd("Qwen2.5/qwen2.5-vl-7b-instruct-awq", "Qwen2.5-VL-7B-Instruct-AWQ", 35, 8, false), - vllmCmd("Qwen2.5/qwen2.5-vl-3b-instruct", "Qwen2.5-VL-3B-Instruct", 35, 8, false), - vllmCmd("Qwen2.5/qwen2.5-vl-3b-instruct-awq", "Qwen2.5-VL-3B-Instruct-AWQ", 35, 8, false), + vllmCmd("Qwen2.5/qwen2.5-vl-3b-instruct", "Qwen2.5-VL-3B-Instruct", 8, false), + vllmCmd("Qwen2.5/qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 8, false), llamaCppVisualCmd("MiniCPM/minicpm-o-2.6-7.6b-q4km", "MiniCPM-o-2_6-7.6B-Q4_K_M.gguf", 35), - vllmCmd("MiniCPM/minicpm-o-2.6-7.6b", "MiniCPM-o-2_6", 35, 10, false), - - vllmEmbeddingCmd("MiniCPM/minicpm-embedding", "MiniCPM-Embedding", 20, 8), - vllmEmbeddingCmd("MiniCPM/minicpm-embedding-light", "MiniCPM-Embedding-Light", 20, 5), - - vllmEmbeddingCmd("MiniCPM/minicpm-reranker", "MiniCPM-Reranker", 20, 8), - vllmEmbeddingCmd("MiniCPM/minicpm-reranker-light", "MiniCPM-Reranker-Light", 20, 5) + vllmCmd("MiniCPM/minicpm-o-2.6-7.6b", "MiniCPM-o-2_6", 10, false) ) ), "config.yaml" @@ -114,23 +107,23 @@ public class LlamaSwapTool extends GeneratorTool { ); } - private DockerCmd vllmCmd(String name, String model, Integer thread, Integer cache, Boolean isReasonable) { - return vllmCmd(name, model, thread, cache, false, false, isReasonable, false); + private DockerCmd vllmCmd(String name, String model, Integer cache, Boolean isReasonable) { + return vllmCmd(name, model, cache, false, false, isReasonable); } - private DockerCmd vllmEmbeddingCmd(String name, String model, Integer thread, Integer cache) { - return vllmCmd(name, model, thread, cache, true, false, false, false); + private DockerCmd vllmEmbeddingCmd(String name, String model, Integer cache) { + return vllmCmd(name, model, cache, true, false, false); } - private DockerCmd vllmRerankerCmd(String name, String model, Integer thread, Integer cache) { - return vllmCmd(name, model, thread, cache, false, true, false, false); + private DockerCmd vllmRerankerCmd(String name, String model, Integer cache) { + return vllmCmd(name, model, cache, false, true, false); } - private DockerCmd vllmVisualCmd(String name, String model, Integer thread, Integer cache, Boolean isReasonable) { - return vllmCmd(name, model, thread, cache, false, false, isReasonable, true); + private DockerCmd vllmVisualCmd(String name, String model, Integer cache, Boolean isReasonable) { + return vllmCmd(name, model, cache, false, false, isReasonable); } - private DockerCmd vllmCmd(String name, String model, Integer thread, Integer cache, Boolean isEmbedding, Boolean isReranker, Boolean isReasonable, Boolean isVisual) { + private DockerCmd vllmCmd(String name, String model, Integer cache, Boolean isEmbedding, Boolean isReranker, Boolean isReasonable) { List arguments = ListUtil.list( false, StrUtil.format("--model /models/{}", model), @@ -138,11 +131,13 @@ public class LlamaSwapTool extends GeneratorTool { "--port ${PORT}", StrUtil.format("--api-key {}", API_KEY), "--disable-log-requests", - "--uvicorn-log-level error" + "--uvicorn-log-level error", + "--trust-remote-code" ); if (isEmbedding) { arguments.add("--task embedding"); } else if (isReranker) { + arguments.add("--task score"); } else if (isReasonable) { arguments.add("--enable-auto-tool-choice"); arguments.add("--tool-call-parser hermes"); @@ -158,9 +153,7 @@ public class LlamaSwapTool extends GeneratorTool { StrUtil.format("--name vllm-{}", displayName(model)), "--privileged=true", "--shm-size=4g", - StrUtil.format("-e VLLM_CPU_KVCACHE_SPACE={}", cache), - StrUtil.format("-e VLLM_CPU_OMP_THREADS_BIND=0-{}", thread - 1), - "-e VLLM_CPU_MOE_PREPACK=0" + StrUtil.format("-e VLLM_CPU_KVCACHE_SPACE={}", cache) ), arguments ); diff --git a/service-ai/service-ai-cli/src/main/resources/template/llama-swap.ftl b/service-ai/service-ai-cli/src/main/resources/template/llama-swap.ftl index 6a0bd27..612e573 100644 --- a/service-ai/service-ai-cli/src/main/resources/template/llama-swap.ftl +++ b/service-ai/service-ai-cli/src/main/resources/template/llama-swap.ftl @@ -1,4 +1,4 @@ -healthCheckTimeout: 120 +healthCheckTimeout: 600 logLevel: warn models: <#list models as model>