1、拉取镜像:
# Pull the NVIDIA PyTorch base image; vLLM must be installed into it manually (see step 2).
docker pull nvcr.io/nvidia/pytorch:23.10-py3
此镜像需要自行安装 vLLM;也可直接拉取官方镜像,但官方镜像只能以类 OpenAI API 服务方式启动:
# Pull the official vLLM image (runs only the OpenAI-compatible API server).
docker pull vllm/vllm-openai:latest
# Start the official vLLM OpenAI-compatible server container.
# Host port 8880 is mapped to the in-container API port 8000.
# NOTE(fix): --max-model-len was 8129, a transposition typo of 8192
# (model context lengths are powers of two).
docker run --runtime nvidia --gpus all --name vllm \
-v /mount/nfs/Dataset/ql/model:/root/model \
-v /mount/nfs/Dataset/ql/vllm:/root/vllm \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-p 8880:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model /root/model/Qwen/qwen1.5-14b-chat-gptq-int4 \
--gpu-memory-utilization 0.8 \
--tensor-parallel-size 2 \
--max-model-len 8192 \
--served-model-name Qwen1.5-14B-Chat
2、安装环境
# Launch a detached interactive container from the NVIDIA PyTorch image;
# vLLM will be pip-installed into it manually afterwards.
docker run --interactive --detach \
  --runtime nvidia --gpus all \
  --name vllm \
  --ipc=host \
  -p 8880:8000 \
  --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
  -v /mount/nfs/Dataset/ql/model:/root/model \
  -v /mount/nfs/Dataset/ql/vllm:/root/vllm \
  nvcr.io/nvidia/pytorch:23.10-py3
# Open an interactive shell inside the running container.
docker exec -it vllm /bin/bash
# Install vLLM inside the container (via the Tsinghua PyPI mirror).
pip install vllm -i https://pypi.tuna.tsinghua.edu.cn/simple
3、部署服务
部署api服务:
# Start the legacy vLLM /generate API server (default port 8000).
# NOTE(review): two GPUs are made visible but no --tensor-parallel-size is
# given, so only one GPU will actually be used — confirm intent.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /root/model/Qwen/qwen1.5-14b-chat-gptq-int4
# Smoke-test the /generate endpoint: beam search with 4 candidates,
# temperature 0 (deterministic).
curl http://localhost:8000/generate \
-d '{
"prompt": "San Francisco is a",
"use_beam_search": true,
"n": 4,
"temperature": 0
}'
部署 OpenAI 风格的 API 服务:
# Start the OpenAI-compatible vLLM API server on GPUs 6 and 7.
# NOTE(fix): the --served-model-name line was missing its trailing backslash,
# so --gpu-memory-utilization and --tensor-parallel-size were silently dropped
# and the following line ran as a broken stray command.
CUDA_VISIBLE_DEVICES=6,7 python -m vllm.entrypoints.openai.api_server \
--model /root/model/Qwen/qwen1.5-14b-chat-gptq-int4 \
--served-model-name qwen1.5-14b-chat-int4 \
--gpu-memory-utilization 0.8 \
--tensor-parallel-size 2
# Query the OpenAI-style /v1/completions endpoint.
# NOTE(fix): "model" must match the --served-model-name the server was
# launched with ("qwen1.5-14b-chat-int4"); the original value
# "llama-2-13b-chat-hf" would be rejected with a model-not-found error.
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen1.5-14b-chat-int4",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'