Start the inference service:
# Start the service
port=8000
python3 -m vllm.entrypoints.openai.api_server \
    --model $model_path --tensor-parallel-size=2 --trust-remote-code \
    --gpu-memory-utilization=0.8 \
    --port=$port \
    --enforce-eager --max-model-len=8192 \
    --served-model-name='your model name' >> _output.log &
Set --tensor-parallel-size to the number of GPUs you are using. --served-model-name must match the `model` string in the infer script launched below (the two strings simply have to be identical).
# Wait for the inference service to come up, with a 20-minute timeout
timeout_limit_seconds=1200  # 20 minutes
remaining_timeout_seconds=$timeout_limit_seconds
print_interval=30  # print the remaining time every 30 seconds
while ! nc -z localhost $port; do
    sleep 10
    ((remaining_timeout_seconds-=10))
    if [ $remaining_timeout_seconds -gt 0 ]; then
        if [ $((remaining_timeout_seconds % print_interval)) -eq 0 ]; then
            echo "Waiting for the inference service to start, $remaining_timeout_seconds seconds left before timeout"
        fi
    else
        echo "Timed out waiting for the inference service to start"
        exit 1
    fi
done
echo "Inference service started successfully in $(($timeout_limit_seconds - $remaining_timeout_seconds)) seconds"
# Launch the prediction job
python3 infer_by_api.py --port $port --data_file ${data_file} --output_file ${output_file} --n_jobs ${n_jobs}
echo "Prediction job finished"
# Prediction done; exit the script
exit 0
data_file, output_file and n_jobs are shell variables you set before this point; n_jobs controls how many threads the infer script uses for concurrent API calls.
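On the --served-model-name note above: once the service is up, the OpenAI-compatible /v1/models endpoint reports the name being served. A minimal sanity check from Python, assuming the server is reachable on localhost:8000 (an illustrative check, not part of the original scripts):
import requests

# Ask the vLLM OpenAI-compatible server which model names it serves
resp = requests.get("http://localhost:8000/v1/models", timeout=10)
resp.raise_for_status()
print([m["id"] for m in resp.json()["data"]])  # should list exactly your --served-model-name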
The inference Python script (infer_by_api.py):
import argparse
import concurrent.futures
import json
import os

import openai
from tqdm.auto import tqdm
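# The script also calls load_json, load_jsonl and save_json, which the original does not
# show (they presumably live in a shared utility module). Minimal sketches matching how
# they are used below; replace them with your own helpers if you already have them.
def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_json(path, data):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)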
# The training script applies this chat-template conversion, so apply the same conversion here:
chat_prompt = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>human\n{}<|im_end|>\n<|im_start|>gpt\n"""
def process_item(one_item):
    model = "your model name"  # must be identical to the --served-model-name used when launching vLLM in the bash script above
    prompt = one_item['prompt']
    prompt = chat_prompt.format(prompt)
    # res is the final record you get back; add whatever extra information you want to keep to it.
    # "pre_result" collects the raw model responses; the prompt asks the model to return a list
    # (the output format was standardized when the model was trained).
    res = {'raw_prompt': prompt, "pre_result": []}
    for k in range(5):  # call the model 5 times per question, so a single failed call does not lose the item
        try:
            # legacy openai<1.0 completion API; requests go to the vLLM server set via openai.api_base below
            completion = openai.Completion.create(
                model=model,
                prompt=prompt,
                echo=False,
                stop=["<|im_end|>", "<|endoftext|>"],  # Qwen stop tokens
                stream=False,
                temperature=0.1,
                max_tokens=4096,  # maximum number of tokens in the generated result
                top_p=0.5
            )
            result = completion.to_dict_recursive()
            res["pre_result"].append(result)
        except Exception as e:
            print(e)
            continue
    return res
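# Each entry in "pre_result" is a raw OpenAI-style completion dict, so the generated text
# sits under choices[0]["text"]. A small helper to pull the texts back out
# (an illustrative addition, not part of the original script):
def extract_texts(res):
    texts = []
    for completion in res["pre_result"]:
        choices = completion.get("choices", [])
        if choices:
            texts.append(choices[0].get("text", ""))
    return texts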
def run(args):
    if args.data_file.endswith(".jsonl"):
        test_data = load_jsonl(args.data_file)
    else:
        test_data = load_json(args.data_file)
    save_dir = os.path.dirname(args.output_file)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    n_jobs = min(args.n_jobs, len(test_data))  # typically 100-500
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_jobs) as executor:
        result_list = list(
            tqdm(executor.map(process_item, test_data), total=len(test_data))
        )
    save_json(args.output_file, result_list)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_file",
        type=str,
        help="Path to the input data file")
    parser.add_argument(
        "--output_file",
        type=str,
        help="Path to the output file")
    parser.add_argument(
        "--n_jobs",
        type=int,
        help="Number of threads to use for parallel processing",
        default=50)
    parser.add_argument(
        "--port",
        type=int,
        help="Port number for the API server",
        default=8000,
    )
    parser.add_argument("--host", type=str, help="Host for the API server", default="localhost")
    args = parser.parse_args()
    openai.api_key = "EMPTY"  # dummy key; the local vLLM server here does not require authentication
    openai.api_base = f"http://{args.host}:{args.port}/v1"  # connect to the vLLM server started above
    print(args)
    run(args)
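After the run, the output file is a JSON list with one record per input item. A hypothetical post-processing step might read it back and check how many items never got a successful completion:
import json

# Read the saved predictions and count records whose "pre_result" stayed empty
# after all 5 attempts (the file name is whatever you passed as --output_file).
with open("predictions.json", "r", encoding="utf-8") as f:
    records = json.load(f)
failed = sum(1 for r in records if not r["pre_result"])
print(f"{len(records)} records, {failed} with no successful completion")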