This is a test of throughput, time-to-first-token latency, and concurrency for a large-model inference service. Feedback and corrections are welcome.
import concurrent.futures
import time
from collections import deque

import requests

# API configuration
API_URL = "http://0.0.0.0:1025/v1/chat/completions"  # e.g. "https://api.example.com/v1/chat/completions"
API_KEY = "EMPTY"  # set this if your endpoint requires a key
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
PROMPT = "随机生成512个字"  # the test prompt ("randomly generate 512 characters")
# Send a single (non-streaming) request and time it
def send_request(prompt):
    start_time = time.time()
    response = requests.post(
        API_URL,
        headers=HEADERS,
        json={
            "model": "qwen1.5-110b-chat",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 512,
        },  # adjust these parameters for your API
    )
    response.raise_for_status()  # raise HTTPError if the status code is not 2xx
    end_time = time.time()
    # Returns (end-to-end latency, request start time, parsed JSON); assumes an OpenAI-compatible response
    return end_time - start_time, start_time, response.json()
# Measure throughput, average latency, and (an upper bound on) first-token latency
def test_api_performance(num_requests=100, concurrency=1):
    total_time = 0.0
    total_completion_tokens = 0
    first_token_times = deque(maxlen=num_requests)  # keep the most recent num_requests timings

    wall_start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [executor.submit(send_request, PROMPT) for _ in range(num_requests)]
        for future in concurrent.futures.as_completed(futures):
            response_time, start_time, payload = future.result()
            total_time += response_time
            msg = payload["choices"][0]["message"]["content"]
            completion_tokens = payload["usage"]["completion_tokens"]
            total_completion_tokens += completion_tokens
            # The request is non-streaming, so the whole answer arrives at once:
            # the end-to-end latency is only an upper bound on the first-token latency
            first_token_times.append(response_time)
    wall_time = time.time() - wall_start

    # Averages over all requests; throughput uses wall-clock time so that concurrency is accounted for
    avg_response_time = total_time / num_requests
    avg_first_token_time = sum(first_token_times) / len(first_token_times)
    avg_throughput = total_completion_tokens / wall_time

    print(f"Total completion tokens: {total_completion_tokens}")
    print(f"Average Response Time: {avg_response_time:.4f} seconds")
    print(f"Average First Token Time (upper bound): {avg_first_token_time:.4f} seconds")
    print(f"Average throughput: {avg_throughput:.4f} tokens / second")
    print(msg)  # text of the last completed response, as a quick sanity check
# Run the test, e.g. test_api_performance(num_requests=100, concurrency=10) for 100 requests across 10 threads
test_api_performance(num_requests=1, concurrency=1)
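
A note on the first-token number: because the requests above are non-streaming, the script can only report the full response latency as an upper bound on time-to-first-token. If the endpoint supports the OpenAI-style "stream": true option (vLLM and similar OpenAI-compatible servers generally do), TTFT can be measured directly by timestamping the first content chunk of the server-sent-event stream. Below is a minimal sketch under that assumption; it reuses API_URL, HEADERS, and the model name from the script above, and measure_ttft_streaming is just an illustrative helper name.

import json
import time

import requests

def measure_ttft_streaming(prompt, url=API_URL, headers=HEADERS):
    """Rough TTFT measurement via a streaming chat completion (assumes OpenAI-style SSE 'data:' lines)."""
    start = time.time()
    ttft = None
    with requests.post(
        url,
        headers=headers,
        json={
            "model": "qwen1.5-110b-chat",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 512,
            "stream": True,  # ask the server to stream tokens as they are generated
        },
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = line.decode("utf-8")
            if not chunk.startswith("data:"):
                continue
            data = chunk[len("data:"):].strip()
            if data == "[DONE]":
                break
            delta = json.loads(data)["choices"][0].get("delta", {})
            if ttft is None and delta.get("content"):
                ttft = time.time() - start  # first generated token arrived
    return ttft

Calling print(measure_ttft_streaming(PROMPT)) would then report the time-to-first-token in seconds for a single streamed request, which can be aggregated across threads in the same way as the non-streaming script.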