As is well known, vLLM inference can be deployed in two ways:
- launching an HTTP service from the command line and calling it over HTTP, or
- calling vLLM's LLM class directly in code.
The first way, via HTTP:
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.9 --max-model-len 29856 --served-model-name Qwen1.5-72B-Chat --model /home/zhongxingyu/Qwen1.5-72B-Chat --tensor-parallel-size 4
The service can then be called with a plain HTTP request:
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen1.5-72B-Chat",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Tell me something about large language models."}
    ]
  }'
To release the occupied GPU resources, simply kill this Python process.
The second way, calling vLLM's LLM class:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from logger.logging_config import logger


class VllmRun:
    def __init__(self, model_path: str, tensor_parallel_size: int = 4, gpu_memory_utilization: float = 0.95):
        logger.info(f"vllm init with model_path: {model_path}, tensor_parallel_size: {tensor_parallel_size}, gpu_memory_utilization: {gpu_memory_utilization}")
        self.llm = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Stop on EOS; "<|eot_id|>" is Llama-3's end-of-turn token -- adjust it for other model families.
        self.stop_words_ids = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    def close(self):
        import gc
        import os

        import torch
        from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel

        # Avoid the huggingface/tokenizers fork deadlock.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        destroy_model_parallel()
        # Delete the vllm.executor.ray_gpu_executor.RayGPUExecutor object that holds the model.
        del self.llm.llm_engine.model_executor
        del self.llm
        gc.collect()
        torch.cuda.empty_cache()
        import ray
        ray.shutdown()

    def single_chat(self, prompt: str, temperature: float = 0.3, top_p: float = 0.8, repetition_penalty: float = 1.05, max_tokens: int = 2048, instruct: str = "You are a helpful assistant."):
        sampling_params = SamplingParams(temperature=temperature, top_p=top_p, repetition_penalty=repetition_penalty, max_tokens=max_tokens, stop_token_ids=self.stop_words_ids, skip_special_tokens=True)
        messages = [
            {"role": "system", "content": instruct},
            {"role": "user", "content": prompt},
        ]
        # Render the conversation with the model's chat template and append the generation prompt.
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        outputs = self.llm.generate(text, sampling_params)
        return outputs
To release GPU resources from code here, you have to implement the close function above and shut things down manually: official vLLM currently provides no shutdown API. The version used here is vllm==0.4.0.post1.
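For completeness, a minimal usage sketch of the class above; the model path reuses the checkpoint from the launch command earlier and the prompt is a placeholder, and the generated text is read from the RequestOutput objects that llm.generate returns:

# Substitute your own checkpoint path here.
runner = VllmRun(model_path="/home/zhongxingyu/Qwen1.5-72B-Chat", tensor_parallel_size=4)

outputs = runner.single_chat("Tell me something about large language models.")
# llm.generate returns a list of RequestOutput objects, one per prompt;
# each holds its completions in the .outputs list.
for output in outputs:
    print(output.outputs[0].text)

# Manually release the GPU memory once inference is done.
runner.close()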