vLLM for LLM Inference Serving and Client Access, Part 3: Streaming

flyfish

Streaming access

This post shows how to request streaming output from a vLLM OpenAI-compatible server, including how to parse the server-sent events (SSE) it returns.

```python
import requests
import json

# Configuration
BASE_URL = "http://0.0.0.0:8000/v1"
API_KEY = "token-abc123"
MODEL_NAME = "LLM-Research/Meta-Llama-3-8B-Instruct"
INPUT_CONTENT = "Who are you?"

# Request headers
HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {API_KEY}'
}

# Request body
DATA = {
    "model": MODEL_NAME,
    "messages": [
        {"role": "user", "content": INPUT_CONTENT}
    ],
    "top_k": 50,  # sample only from the 50 highest-probability tokens (a vLLM extension to the OpenAI schema)
    "temperature": 0.7,  # randomness of the generated text
    "max_tokens": 10,  # maximum number of tokens to generate
    "presence_penalty": 0.1,  # penalize tokens that have already appeared
    "frequency_penalty": 0.1,  # penalize tokens in proportion to their frequency
    "stream": True  # request a streamed (SSE) response
}

def send_request(url, headers, data):
    """Send the POST request and return the streaming response."""
    response = requests.post(url, headers=headers, json=data, stream=True)
    return response

def process_stream_response(response):
    """Process the streaming response and print the generated text in real time."""

    generated_text = ""
    for line in response.iter_lines():
        if line:
            decoded_line = line.decode('utf-8')
            print(f"Received data: {decoded_line}")
            # SSE payload lines are prefixed with "data:"; the stream ends with "[DONE]"
            if decoded_line.startswith('data:'):
                json_data = decoded_line[len('data:'):].strip()
                if json_data == '[DONE]':
                    break
                data = json.loads(json_data)
                choices = data.get('choices', [])
                if choices:
                    # each chunk carries the newly generated fragment in delta.content
                    delta = choices[0].get('delta', {})
                    new_text = delta.get('content', '')
                    generated_text += new_text
                    print(new_text, end='')
    return generated_text



def main():
    url = f"{BASE_URL}/chat/completions"
    response = send_request(url, HEADERS, DATA)
    
    if response.status_code == 200:
        generated_text = process_stream_response(response)
        print("Final Generated Text:", generated_text)
    else:
        print(f"Request failed with status code {response.status_code}: {response.text}")

if __name__ == "__main__":
    main()
```
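The same endpoint can also be consumed with the official `openai` Python SDK (v1 interface), which handles the SSE parsing for you. A minimal sketch, assuming `pip install openai` and reusing the server address, API key, and model name from the script above:

```python
from openai import OpenAI

# Point the SDK at the vLLM server instead of api.openai.com
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")

stream = client.chat.completions.create(
    model="LLM-Research/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Who are you?"}],
    max_tokens=10,
    temperature=0.7,
    stream=True,  # request chunked deltas instead of one final message
)

generated_text = ""
for chunk in stream:
    if not chunk.choices:  # some chunks may carry no choices
        continue
    delta = chunk.choices[0].delta.content or ""
    generated_text += delta
    print(delta, end="", flush=True)

print("\nFinal Generated Text:", generated_text)
```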

Received data

The script prints every raw line it receives and also prints each token with `print(new_text, end='')`, so the two interleave in the console output:

Received data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
Received data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"I"},"logprobs":null,"finish_reason":null}]}
IReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" am"},"logprobs":null,"finish_reason":null}]}
 amReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" L"},"logprobs":null,"finish_reason":null}]}
 LReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"La"},"logprobs":null,"finish_reason":null}]}
LaReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"MA"},"logprobs":null,"finish_reason":null}]}
MAReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":","},"logprobs":null,"finish_reason":null}]}
,Received data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" an"},"logprobs":null,"finish_reason":null}]}
 anReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" AI"},"logprobs":null,"finish_reason":null}]}
 AIReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" assistant"},"logprobs":null,"finish_reason":null}]}
 assistantReceived data: data: {"id":"chat-1","object":"chat.completion.chunk","created":1730428960,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" developed"},"logprobs":null,"finish_reason":"length","stop_reason":null}]}
 developedReceived data: data: [DONE]
Final Generated Text: I am LLaMA, an AI assistant developed


Streamed data

The raw SSE lines, exactly as the server sends them:

data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"I"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" am"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" L"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"La"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":"MA"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":","},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" an"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" AI"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" assistant"},"logprobs":null,"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1730429430,"model":"LLM-Research/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"delta":{"content":" developed"},"logprobs":null,"finish_reason":"length","stop_reason":null}]}
data: [DONE]
Final Generated Text: I am LLaMA, an AI assistant developed

After parsing

The `delta.content` fragment extracted from each chunk:

I
 am
 L
La
MA
,
 an
 AI
 assistant
 developed
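Each chunk also carries a `finish_reason`; it stays `null` until the final content chunk, where it is `"length"` here because generation hit `max_tokens: 10`. A small check, using the last raw line above as input:

```python
import json

# The final data chunk from the raw stream above (payload after the "data: " prefix)
raw = ('{"id":"chat-1","object":"chat.completion.chunk","created":1730429430,'
       '"model":"LLM-Research/Meta-Llama-3-8B-Instruct",'
       '"choices":[{"index":0,"delta":{"content":" developed"},'
       '"logprobs":null,"finish_reason":"length","stop_reason":null}]}')

choice = json.loads(raw)["choices"][0]
print(choice["delta"]["content"])  # " developed"
print(choice["finish_reason"])     # "length": the max_tokens limit was reached
```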
### On vLLM's streaming inference mechanism

#### Overview

vLLM enables streaming inference through a request parameter, letting the client receive the model's output in real time as it is generated. For a large model such as Qwen2, the feature is switched on by passing `"stream": true` over the HTTP interface[^1].

#### How it works

To use streaming inference, structure the request as follows:

- **API request structure**: besides the model to use (`model`) and the conversation history (`messages`), the request must explicitly select streaming transfer mode (`stream: true`).

```json
{
  "model": "qwen-7b-chat",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me about the weather today"},
    {"role": "assistant", "content": ""}
  ],
  "stream": true
}
```

- **Response handling**: once streaming output is enabled, the server returns the partially completed answer to the frontend in batches; each newly received fragment is pushed to the user interface immediately, producing a continuous, fluid text display (a reusable generator version of this loop is sketched after this section).

```python
import requests

url = 'http://localhost:8000/v1/chat/completions'
data = {
    "model": "qwen-7b-chat",
    "messages": [{"role": "user", "content": "What is your favorite color?"}],
    "stream": True,
}

# stream=True is needed on the client side too, so chunks can be read as they arrive
response = requests.post(url, json=data, stream=True)
for chunk in response.iter_lines():
    if chunk:
        decoded_line = chunk.decode('utf-8')
        print(decoded_line)
```

The code above sends a simple POST request to a deployed vLLM instance and prints its incrementally produced reply, line by line.
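For UI integration, it is often convenient to wrap this parsing loop in a generator that yields only the text deltas. A minimal sketch under the same assumptions (OpenAI-compatible chunk format, `requests` on the client side); `stream_deltas` is an illustrative helper name, not a vLLM or OpenAI API:

```python
import json
import requests

def stream_deltas(url, payload, headers=None):
    """Yield content fragments from an OpenAI-compatible streaming endpoint."""
    with requests.post(url, headers=headers, json=payload, stream=True) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            decoded = line.decode('utf-8')
            if not decoded.startswith('data:'):
                continue
            body = decoded[len('data:'):].strip()
            if body == '[DONE]':  # end-of-stream sentinel
                break
            chunk = json.loads(body)
            choices = chunk.get('choices', [])
            if choices:
                text = choices[0].get('delta', {}).get('content', '')
                if text:
                    yield text

# Usage: print fragments as they arrive, or collect the full answer:
# full_text = "".join(stream_deltas(url, data))
```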