|
|
|
|
|
|
|
|
|
package com.lanyuanxiaoyao.service.ai.cli;
|
|
|
|
|
|
|
|
|
|
import cn.hutool.core.collection.ListUtil;
|
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author lanyuanxiaoyao
|
|
|
|
|
* @version 20250612
|
|
|
|
|
*/
|
|
|
|
|
public class LlamaSwapTool extends GeneratorTool {
|
|
|
|
|
private static final String API_KEY = "*XMySqV%>hR&v>>g*NwCs3tpQ5FVMFEF2VHVTj<MYQd$&@$sY7CgqNyea4giJi4";
|
|
|
|
|
|
|
|
|
|
public static String displayName(String name) {
|
|
|
|
|
return name.replaceAll("\\s+", "_")
|
|
|
|
|
.replaceAll("\\.", "_")
|
|
|
|
|
.toLowerCase();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
public void generate() {
    // Renders the llama-swap.ftl template into config.yaml, passing the full
    // catalog of models that llama-swap may launch — each entry is a DockerCmd
    // describing either a llama.cpp or a vLLM container.
    generateTemplate(
            "llama-swap.ftl",
            Map.of(
                    "models", List.of(
                            // BGE embedding / reranker models.
                            llamaCppEmbeddingCmd("BGE/bge-m3-q4km", "bge-m3-Q4_K_M.gguf", 20),
                            vllmEmbeddingCmd("BGE/bge-m3", "bge-m3", 20, 5),
                            // NOTE(review): "beg-reranker" in the two served names below looks like a
                            // typo for "bge-reranker" (the model files say bge) — confirm before changing,
                            // since clients address models by these names.
                            llamaCppRerankerCmd("BGE/beg-reranker-v2-q4km", "bge-reranker-v2-m3-Q4_K_M.gguf", 20),
                            // NOTE(review): "bge-reranker-v2-m3" has no .gguf suffix yet is handed to the
                            // llama.cpp builder (every other llamaCpp* entry is a .gguf file) — possibly
                            // meant to be vllmRerankerCmd; verify against the models directory.
                            llamaCppRerankerCmd("BGE/beg-reranker-v2", "bge-reranker-v2-m3", 20),

                            // Qwen3 chat models (vLLM entries enable reasoning/tool-call parsing).
                            vllmCmd("Qwen3/qwen3-0.6b", "Qwen3-0.6B", 35, 5, true),
                            vllmCmd("Qwen3/qwen3-1.7b", "Qwen3-1.7B", 35, 5, true),
                            vllmCmd("Qwen3/qwen3-4b", "Qwen3-4B", 35, 8, true),
                            llamaCppCmd("Qwen3/qwen3-4b-q4km", "Qwen3-4B-Q4_K_M.gguf", 35),
                            llamaCppCmd("Qwen3/qwen3-8b-q4km", "Qwen3-8B-Q4_K_M.gguf", 35),

                            // Qwen3 embedding models.
                            vllmEmbeddingCmd("Qwen3/qwen3-embedding-0.6b", "Qwen3-Embedding-0.6B", 35, 5),
                            vllmEmbeddingCmd("Qwen3/qwen3-embedding-4b", "Qwen3-Embedding-4B", 35, 8),
                            llamaCppEmbeddingCmd("Qwen3/qwen3-embedding-4b-q4km", "Qwen3-Embedding-4B-Q4_K_M.gguf", 35),
                            llamaCppEmbeddingCmd("Qwen3/qwen3-embedding-8b-q4km", "Qwen3-Embedding-8B-Q4_K_M.gguf", 35),

                            // Qwen3 rerankers.
                            vllmRerankerCmd("Qwen3/qwen3-reranker-0.6b", "Qwen3-Reranker-0.6B", 35, 5),
                            vllmRerankerCmd("Qwen3/qwen3-reranker-4b", "Qwen3-Reranker-4B", 35, 8),

                            // Qwen2.5 vision-language models.
                            // NOTE(review): the vLLM VL entries use plain vllmCmd rather than
                            // vllmVisualCmd (which is currently unused) — confirm that is intended.
                            llamaCppVisualCmd("Qwen2.5/qwen2.5-vl-7b", "Qwen2.5-VL-7B-Instruct-BF16.gguf", 35),
                            llamaCppVisualCmd("Qwen2.5/qwen2.5-vl-7b-q4km", "Qwen2.5-VL-7B-Instruct-Q4_K_M.gguf", 35),
                            vllmCmd("Qwen2.5/qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 35, 8, false),
                            vllmCmd("Qwen2.5/qwen2.5-vl-7b-instruct-awq", "Qwen2.5-VL-7B-Instruct-AWQ", 35, 8, false),
                            vllmCmd("Qwen2.5/qwen2.5-vl-3b-instruct", "Qwen2.5-VL-3B-Instruct", 35, 8, false),
                            vllmCmd("Qwen2.5/qwen2.5-vl-3b-instruct-awq", "Qwen2.5-VL-3B-Instruct-AWQ", 35, 8, false),

                            // MiniCPM omni-modal models.
                            llamaCppVisualCmd("MiniCPM/minicpm-o-2.6-7.6b-q4km", "MiniCPM-o-2_6-7.6B-Q4_K_M.gguf", 35),
                            vllmCmd("MiniCPM/minicpm-o-2.6-7.6b", "MiniCPM-o-2_6", 35, 10, false),

                            // MiniCPM embedding models.
                            vllmEmbeddingCmd("MiniCPM/minicpm-embedding", "MiniCPM-Embedding", 20, 8),
                            vllmEmbeddingCmd("MiniCPM/minicpm-embedding-light", "MiniCPM-Embedding-Light", 20, 5),

                            // NOTE(review): the two MiniCPM *reranker* models are built with
                            // vllmEmbeddingCmd (--task embedding), not vllmRerankerCmd — verify
                            // whether this is deliberate for these models.
                            vllmEmbeddingCmd("MiniCPM/minicpm-reranker", "MiniCPM-Reranker", 20, 8),
                            vllmEmbeddingCmd("MiniCPM/minicpm-reranker-light", "MiniCPM-Reranker-Light", 20, 5)
                    )
            ),
            "config.yaml"
    );
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd llamaCppCmd(String name, String model, Integer thread) {
|
|
|
|
|
return llamaCppCmd(name, model, thread, false, false, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd llamaCppEmbeddingCmd(String name, String model, Integer thread) {
|
|
|
|
|
return llamaCppCmd(name, model, thread, true, false, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd llamaCppRerankerCmd(String name, String model, Integer thread) {
|
|
|
|
|
return llamaCppCmd(name, model, thread, false, true, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd llamaCppVisualCmd(String name, String model, Integer thread) {
|
|
|
|
|
return llamaCppCmd(name, model, thread, false, false, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the docker run descriptor that serves {@code model} with the
 * llama.cpp server image. Exactly one of the three mode flags is honored,
 * in this precedence order: embedding &gt; reranker &gt; visual; when all are
 * false the model is treated as a plain chat model and gets --jinja.
 *
 * @param name        served model name exposed through llama-swap
 * @param model       model file name under /models (also used to derive the
 *                    container name and proxy host via displayName)
 * @param thread      CPU thread count passed as -t
 * @param isEmbedding serve as an embedding endpoint
 * @param isReranker  serve as a reranking endpoint
 * @param isVisual    vision-language model; expects "&lt;model&gt;.mmproj" beside the weights
 * @return docker command descriptor for this model
 */
private DockerCmd llamaCppCmd(String name, String model, Integer thread, Boolean isEmbedding, Boolean isReranker, Boolean isVisual) {
    // ListUtil.list(false, ...) builds a mutable, array-backed list so the
    // mode-specific flags can be appended below.
    List<String> arguments = ListUtil.list(
            false,
            StrUtil.format("-m /models/{}", model),
            // ${PORT} is left literal here; llama-swap substitutes it at launch time.
            "--port ${PORT}",
            // NOTE(review): API key is hard-coded in source — consider moving to config/env.
            StrUtil.format("--api-key {}", API_KEY),
            "-c 0",
            "-b 4096",
            StrUtil.format("-t {}", thread),
            "-np 5",
            "--log-disable",
            "--no-webui"
    );
    // Mutually exclusive modes (else-if): embedding wins over reranker over visual.
    if (isEmbedding) {
        arguments.add("--embedding");
        arguments.add("-ub 8192");
        arguments.add("--pooling mean");
    } else if (isReranker) {
        arguments.add("--reranking");
    } else if (isVisual) {
        // Projector file is assumed to sit next to the weights as "<model>.mmproj".
        arguments.add(StrUtil.format("--mmproj /models/{}.mmproj", model));
    } else {
        // Plain chat only: enable jinja chat templates.
        // NOTE(review): visual models do NOT get --jinja — confirm intended.
        arguments.add("--jinja");
    }
    return new DockerCmd(
            "ghcr.io/ggml-org/llama.cpp:server",
            name,
            model,
            // Proxy URL and container name are both derived from the model file name,
            // so two entries sharing a model file would collide on container name.
            StrUtil.format("http://llamacpp-{}:${PORT}", displayName(model)),
            List.of(StrUtil.format("--name llamacpp-{}", displayName(model))),
            arguments
    );
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd vllmCmd(String name, String model, Integer thread, Integer cache, Boolean isReasonable) {
|
|
|
|
|
return vllmCmd(name, model, thread, cache, false, false, isReasonable, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd vllmEmbeddingCmd(String name, String model, Integer thread, Integer cache) {
|
|
|
|
|
return vllmCmd(name, model, thread, cache, true, false, false, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd vllmRerankerCmd(String name, String model, Integer thread, Integer cache) {
|
|
|
|
|
return vllmCmd(name, model, thread, cache, false, true, false, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private DockerCmd vllmVisualCmd(String name, String model, Integer thread, Integer cache, Boolean isReasonable) {
|
|
|
|
|
return vllmCmd(name, model, thread, cache, false, false, isReasonable, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the docker run descriptor that serves {@code model} with the CPU
 * vLLM image. Mode precedence (else-if): embedding &gt; reranker &gt; reasoning;
 * a model with none of those flags gets only the base arguments.
 *
 * @param name         served model name (--served-model-name)
 * @param model        model directory name under /models (also used to derive
 *                     the container name and proxy host via displayName)
 * @param thread       CPU thread count; OMP threads are bound to cores 0..thread-1
 * @param cache        KV-cache space in GB (VLLM_CPU_KVCACHE_SPACE)
 * @param isEmbedding  serve with --task embedding
 * @param isReranker   reranker mode (currently adds no extra flags — see note below)
 * @param isReasonable enable auto tool-choice and reasoning parsers
 * @param isVisual     NOTE(review): accepted but never read in this body — confirm
 *                     whether vision models need extra vLLM flags here
 * @return docker command descriptor for this model
 */
private DockerCmd vllmCmd(String name, String model, Integer thread, Integer cache, Boolean isEmbedding, Boolean isReranker, Boolean isReasonable, Boolean isVisual) {
    // ListUtil.list(false, ...) builds a mutable, array-backed list so the
    // mode-specific flags can be appended below.
    List<String> arguments = ListUtil.list(
            false,
            StrUtil.format("--model /models/{}", model),
            StrUtil.format("--served-model-name {}", name),
            // ${PORT} is left literal here; llama-swap substitutes it at launch time.
            "--port ${PORT}",
            // NOTE(review): API key is hard-coded in source — consider moving to config/env.
            StrUtil.format("--api-key {}", API_KEY),
            "--disable-log-requests",
            "--uvicorn-log-level error"
    );
    if (isEmbedding) {
        arguments.add("--task embedding");
    } else if (isReranker) {
        // Intentionally empty? Rerankers get no --task flag here.
        // NOTE(review): vLLM supports "--task score" for rerankers — confirm omission.
    } else if (isReasonable) {
        arguments.add("--enable-auto-tool-choice");
        arguments.add("--tool-call-parser hermes");
        arguments.add("--enable-reasoning");
        arguments.add("--reasoning-parser deepseek_r1");
    }
    return new DockerCmd(
            "vllm-server-cpu:0.9.1",
            name,
            model,
            // Proxy URL and container name are both derived from the model name.
            StrUtil.format("http://vllm-{}:${PORT}", displayName(model)),
            List.of(
                    StrUtil.format("--name vllm-{}", displayName(model)),
                    "--privileged=true",
                    "--shm-size=4g",
                    StrUtil.format("-e VLLM_CPU_KVCACHE_SPACE={}", cache),
                    // Bind OMP worker threads to cores 0..thread-1 (thread is unboxed; must be non-null).
                    StrUtil.format("-e VLLM_CPU_OMP_THREADS_BIND=0-{}", thread - 1),
                    "-e VLLM_CPU_MOE_PREPACK=0"
            ),
            arguments
    );
}
|
|
|
|
|
|
|
|
|
|
public static class DockerCmd {
|
|
|
|
|
private String image;
|
|
|
|
|
private String name;
|
|
|
|
|
private String model;
|
|
|
|
|
private String proxy;
|
|
|
|
|
private List<String> options = ListUtil.list(
|
|
|
|
|
false,
|
|
|
|
|
"--rm",
|
|
|
|
|
"--network llama",
|
|
|
|
|
"-v /data/models:/models"
|
|
|
|
|
);
|
|
|
|
|
private List<String> arguments = ListUtil.list(false);
|
|
|
|
|
|
|
|
|
|
public DockerCmd(String image, String name, String model, String proxy, List<String> options, List<String> arguments) {
|
|
|
|
|
this.image = image;
|
|
|
|
|
this.name = name;
|
|
|
|
|
this.model = model;
|
|
|
|
|
this.proxy = proxy;
|
|
|
|
|
this.options.addAll(options);
|
|
|
|
|
this.arguments.addAll(arguments);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getImage() {
|
|
|
|
|
return image;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setImage(String image) {
|
|
|
|
|
this.image = image;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getName() {
|
|
|
|
|
return name;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setName(String name) {
|
|
|
|
|
this.name = name;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getModel() {
|
|
|
|
|
return model;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setModel(String model) {
|
|
|
|
|
this.model = model;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getProxy() {
|
|
|
|
|
return proxy;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setProxy(String proxy) {
|
|
|
|
|
this.proxy = proxy;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public List<String> getOptions() {
|
|
|
|
|
return options;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setOptions(List<String> options) {
|
|
|
|
|
this.options = options;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public List<String> getArguments() {
|
|
|
|
|
return arguments;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setArguments(List<String> arguments) {
|
|
|
|
|
this.arguments = arguments;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return "DockerCmd{" +
|
|
|
|
|
"image='" + image + '\'' +
|
|
|
|
|
", name='" + name + '\'' +
|
|
|
|
|
", model='" + model + '\'' +
|
|
|
|
|
", proxy='" + proxy + '\'' +
|
|
|
|
|
", options=" + options +
|
|
|
|
|
", arguments=" + arguments +
|
|
|
|
|
'}';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|