文章目录
一、安装
# Create a dedicated Python 3.9 conda environment, activate it, and install vLLM.
conda create -n e39 python=3.9
conda activate e39
pip install vllm
二、服务
1、vllm serve
vllm serve Qwen/Qwen2.5-1.5B-Instruct
显存占用约 20G
2、Python 服务
from vllm import LLM, SamplingParams

# Offline (in-process) inference with vLLM: load the model once, then batch-generate.
# The commented-out tiny model is handy for a quick smoke test.
# llm = LLM(model="facebook/opt-125m")
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")

# All prompts are completed in a single batched generate() call.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Nucleus sampling: temperature=0.8 / top_p=0.95 gives moderately varied output.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# generate() returns one result object per prompt, in the same order as `prompts`;
# each carries the original prompt and a list of candidate completions.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
三、访问
curl - 查看所有模型
curl http://10.0.1.23:8000/v1/models
{
"object": "list",
"data": [{
"id": "Qwen/Qwen2.5-1.5B-Instruct",
"object": "model",
"created": 1741263998,
"owned_by": "vllm",
"root": "Qwen/Qwen2.5-1.5B-Instruct",
"parent": null,
"max_model_len": 32768,
"permission": [{
"id": "modelperm-79dc42186c2f46d085c3c98615f71e47",
"object": "model_permission",
"created": 1741263998,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}]
}]
}
curl - completions
# Text completion via the OpenAI-compatible /v1/completions endpoint.
# max_tokens=7 caps the reply length; temperature=0 makes the output deterministic (greedy).
curl http://10.0.1.23:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
{
"id": "cmpl-3fba4fc307e04e3f8d656049437be215",
"object": "text_completion",
"created": 1741264121,
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"choices": [{
"index": 0,
"text": " city in the state of California,",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null,
"prompt_logprobs": null
}],
"usage": {
"prompt_tokens": 4,
"total_tokens": 11,
"completion_tokens": 7,
"prompt_tokens_details": null
}
}
curl - chat/completions
# Chat completion via the OpenAI-compatible /v1/chat/completions endpoint,
# using the standard system + user message format.
curl http://10.0.1.23:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"}
]
}'
{
"id": "chatcmpl-eaa9d73518ae4900ac3ca03445b94856",
"object": "chat.completion",
"created": 1741264227,
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"reasoning_content": null,
"content": "The World Series in 2020 was played between the New York Yankees and the Boston Red Sox. The Yankees won in seven games, defeating the Red Sox across the regular and季后赛 (playoffs) seasons.",
"tool_calls": []
},
"logprobs": null,
"finish_reason": "stop",
"stop_reason": null
}],
"usage": {
"prompt_tokens": 31,
"total_tokens": 76,
"completion_tokens": 45,
"prompt_tokens_details": null
},
"prompt_logprobs": null
}
Python - completions
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
# The key is a placeholder — only the base URL matters here.
openai_api_key = "EMPTY"
openai_api_base = "http://10.0.1.23:8000/v1"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Request a plain text completion from the served Qwen model.
completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
)
print("Completion result:", completion)
Completion result: Completion(
id='cmpl-accdc1011da548a08ac8f297146e7791',
choices=[
CompletionChoice(
finish_reason='length', index=0, logprobs=None,
text=' great place with a lot of rich history. But in recent years, the people',
stop_reason=None, prompt_logprobs=None)
],
created=1741264345, model='Qwen/Qwen2.5-1.5B-Instruct',
object='text_completion', system_fingerprint=None,
usage=CompletionUsage(
completion_tokens=16, prompt_tokens=4,
total_tokens=20, completion_tokens_details=None,
prompt_tokens_details=None))
Python - chat.completions
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
# The key is a placeholder — only the base URL matters here.
openai_api_key = "EMPTY"
openai_api_base = "http://10.0.1.23:8000/v1"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Standard chat format: one system instruction followed by the user turn.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."},
]
chat_response = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=conversation,
)
print("Chat response:", chat_response)
Chat response: ChatCompletion(
id='chatcmpl-ae641c01fc54450b9a02bba0260c445b',
choices=[Choice(
finish_reason='stop', index=0, logprobs=None,
message=ChatCompletionMessage(
content='Why could the statue of liberty sleep 8 hours a day?\n\nBecause she had a full moon each month.',
refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None)],
created=1741264394, model='Qwen/Qwen2.5-1.5B-Instruct',
object='chat.completion', service_tier=None, system_fingerprint=None,
usage=CompletionUsage(
completion_tokens=23, prompt_tokens=24, total_tokens=47,
completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)
2025-03-06(四)