文章目录
一、安装
# Create a dedicated Python 3.9 conda environment, activate it, and install vLLM.
conda create -n e39 python=3.9
conda activate e39
pip install vllm
二、服务
1、vllm serve
vllm serve Qwen/Qwen2.5-1.5B-Instruct
显存占用约 20G
2、Python 服务
from vllm import LLM, SamplingParams

# Offline (in-process) inference with vLLM: load the model once, then batch-generate.
# The commented-out tiny model is handy for a quick smoke test.
# llm = LLM(model="facebook/opt-125m")
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")

# All prompts are completed in a single batched generate() call.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Nucleus sampling: temperature=0.8 / top_p=0.95 gives moderately varied output.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# generate() returns one result object per prompt, in the same order as `prompts`;
# each carries the original prompt and a list of candidate completions.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
三、访问
curl - 查看所有模型
curl http://10.0.1.23:8000/v1/models
{
"object": "list",
"data": [{
"id": "Qwen/Qwen2.5-1.5B-Instruct",
"object": "model",
"created": 1741263998,
"owned_by": "vllm",
"root": "Qwen/Qwen2.5-1.5B-Instruct",
"parent": null,
"max_model_len": 32768,
"permission": [{
"id": "modelperm-79dc42186c2f46d085c3c98615f71e47",
"object": "model_permission",
"created": 1741263998,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}]
}]
}
curl - completions
# Text completion via the OpenAI-compatible /v1/completions endpoint.
# max_tokens=7 caps the reply length; temperature=0 makes the output deterministic (greedy).
curl http://10.0.1.23:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
{
"id": "cmpl-3fba4fc307e04e3f8d656049437be215",
"object": "text_completion",
"created": 1741264121,
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"choices": [{
"index": 0,
"text": " city in the state of California,",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null,
"prompt_logprobs": null
}],
"usage": {
"prompt_tokens": 4,
"total_tokens": 11,
"completion_tokens": 7,
"prompt_tokens_details": null
}
}
curl - chat/completions
# Chat completion via the OpenAI-compatible /v1/chat/completions endpoint,
# using the standard system + user message format.
curl http://10.0.1.23:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"}
]
}'
{
"id": "chatcmpl-eaa9d73518ae4900ac3ca03445b94856",
"object": "chat.completion",
"created": 1741264227,
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"reasoning_content": null,
"content": "The World Series in 2020 was played between the New York Yankees and the Boston Red Sox. The Yankees won in seven games, defeating the Red Sox across the regular and季后赛 (playoffs) seasons.",
"tool_calls": []
},
"logprobs": null,
"finish_reason": "stop",
"stop_reason": null
}],
"usage": {
"prompt_tokens": 31,
"total_tokens": 76,
"completion_tokens": 45,
"prompt_tokens_details": null
},
"prompt_logprobs": null
}
Python - completions
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
# The key is a placeholder — only the base URL matters here.
openai_api_key = "EMPTY"
openai_api_base = "http://10.0.1.23:8000/v1"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Request a plain text completion from the served Qwen model.
completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
)
print("Completion result:", completion)
Completion result: Completion(
id='cmpl-accdc1011da548a08ac8f297146e7791',
choices=[
CompletionChoice(
finish_reason='length', index=0, logprobs=None,
text=' great place with a lot of rich history. But in recent years, the people',
stop_reason=None, prompt_logprobs=None)
],
created=1741264345, model='Qwen/Qwen2.5-1.5B-Instruct',
object='text_completion', system_fingerprint=None,
usage=CompletionUsage(
completion_tokens=16, prompt_tokens=4,
total_tokens=20, completion_tokens_details=None,
prompt_tokens_details=None))
Python - chat.completions
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
# The key is a placeholder — only the base URL matters here.
openai_api_key = "EMPTY"
openai_api_base = "http://10.0.1.23:8000/v1"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Standard chat format: one system instruction followed by the user turn.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."},
]
chat_response = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=conversation,
)
print("Chat response:", chat_response)
Chat response: ChatCompletion(
id='chatcmpl-ae641c01fc54450b9a02bba0260c445b',
choices=[Choice(
finish_reason='stop', index=0, logprobs=None,
message=ChatCompletionMessage(
content='Why could the statue of liberty sleep 8 hours a day?\n\nBecause she had a full moon each month.',
refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None)],
created=1741264394, model='Qwen/Qwen2.5-1.5B-Instruct',
object='chat.completion', service_tier=None, system_fingerprint=None,
usage=CompletionUsage(
completion_tokens=23, prompt_tokens=24, total_tokens=47,
completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)
2025-03-06(四)