LLM Inference Concurrency and Throughput Testing

This script benchmarks throughput, time-to-first-token (TTFT) latency, and concurrency for LLM inference; feedback and suggestions are welcome.

import requests  
import time  
from collections import deque  
  
# API configuration  
API_URL = "http://0.0.0.0:1025/v1/chat/completions"  # e.g. "https://api.example.com/completions"  
API_KEY = "EMPTY"  # set this if your server requires a key  
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}  
PROMPT = "随机生成512个字"  # your test prompt ("randomly generate 512 characters")  
  
# Send a single request  
def send_request(prompt):  
    start_time = time.time()  
    response = requests.post(  
        API_URL,  
        headers=HEADERS,  
        json={"model": "qwen1.5-110b-chat", "messages": [{"role": "user", "content": prompt}], "max_tokens": 512},  # adjust these parameters for your API  
    )  
    response.raise_for_status()  # raise HTTPError on non-2xx status codes  
    end_time = time.time()  
    return end_time - start_time, start_time, response.json()  # assumes an OpenAI-compatible response format  
  
# Measure throughput and (approximate) first-token latency  
def test_api_performance(num_requests=100, concurrency=1):  
    total_time = 0  
    total_tokens = 0
    first_token_times = deque(maxlen=num_requests)  # keep only the latest num_requests timings  
  
    # Run requests concurrently if requested  
    import concurrent.futures  
  
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:  
        futures = [executor.submit(send_request, PROMPT) for _ in range(num_requests)]  
  
        for future in concurrent.futures.as_completed(futures):  
            response_time, start_time, text = future.result()  
            total_time += response_time 
            msg = text["choices"][0]["message"]["content"]  # generated text (not used in the metrics, handy for debugging)
            completion_tokens = text["usage"]["completion_tokens"]
            total_tokens += completion_tokens
  
            # Note: without streaming, the true time-to-first-token cannot be observed;
            # this only measures the gap between request completion and result processing,
            # so treat it as a rough placeholder  
            first_token_time = time.time() - (response_time + start_time)  
            first_token_times.append(first_token_time)  
  
    # Averages over all requests  
    avg_response_time = total_time / num_requests  
    avg_first_token_time = sum(first_token_times) / len(first_token_times)  
    # total_time sums per-request latencies, so multiply by concurrency to
    # approximate aggregate (wall-clock) throughput
    avg_throughput = total_tokens / total_time * concurrency
    print(total_tokens)
    print(f"Average Response Time: {avg_response_time:.4f} seconds")  
    print(f"Average First Token Time: {avg_first_token_time:.4f} seconds")  
    print(f"Average throughput : {avg_throughput:.4f} tokens / seconds")
    print(text)  # last response, for spot-checking the output
# Run the test; e.g. for 100 requests across 10 worker threads:
# test_api_performance(num_requests=100, concurrency=10)
test_api_performance(num_requests=1, concurrency=1)
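As noted in the comments above, a non-streaming request cannot observe when the first token arrives. A minimal sketch of measuring true TTFT with an OpenAI-compatible streaming endpoint (`"stream": True`, server-sent events); the URL, model name, and `measure_ttft`/`parse_sse_line` helpers are my own illustrative assumptions, not part of the original script:

```python
import json
import time

import requests

API_URL = "http://0.0.0.0:1025/v1/chat/completions"
HEADERS = {"Authorization": "Bearer EMPTY", "Content-Type": "application/json"}


def parse_sse_line(raw_line):
    """Parse one server-sent-events line from an OpenAI-compatible stream.

    Returns the JSON payload dict, or None for blank keep-alives and [DONE]."""
    if not raw_line.startswith("data: "):
        return None
    payload = raw_line[len("data: "):].strip()
    if payload == "[DONE]":
        return None
    return json.loads(payload)


def measure_ttft(prompt, model="qwen1.5-110b-chat", max_tokens=512):
    """Stream one completion; return (ttft, total_time, completion_text)."""
    start = time.time()
    first_token_at = None
    chunks = []
    with requests.post(
        API_URL,
        headers=HEADERS,
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "stream": True,  # key difference: tokens arrive as they are generated
        },
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            event = parse_sse_line(line or "")
            if event is None:
                continue
            content = event["choices"][0].get("delta", {}).get("content")
            if content:
                if first_token_at is None:
                    first_token_at = time.time()  # first generated token arrived
                chunks.append(content)
    end = time.time()
    ttft = (first_token_at - start) if first_token_at else None
    return ttft, end - start, "".join(chunks)


# Usage (requires a running server at API_URL):
# ttft, total, text = measure_ttft("随机生成512个字")
```

Swapping this in for `send_request` would make the "Average First Token Time" metric meaningful instead of a placeholder.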
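Averages also hide tail latency, which matters under concurrency. A standard-library-only sketch of summarizing per-request latencies with median and p95 (the `summarize_latencies` helper and the sample numbers are illustrative assumptions, not measurements from the script above):

```python
import statistics


def summarize_latencies(latencies):
    """Return mean, p50, and p95 for a list of per-request latencies (seconds)."""
    data = sorted(latencies)
    # quantiles with n=20 yields 19 cut points; index 18 is the 95th percentile.
    # method="inclusive" keeps the result inside the observed range.
    p95 = statistics.quantiles(data, n=20, method="inclusive")[18]
    return {
        "mean": statistics.fmean(data),
        "p50": statistics.median(data),
        "p95": p95,
    }


# Made-up latencies with one slow outlier, for illustration
print(summarize_latencies([0.8, 0.9, 1.0, 1.1, 1.2, 3.5]))
```

Collecting `response_time` values into a list inside the loop and printing this summary alongside the averages would show how much a few slow requests skew the mean.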