from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import sys
print(sys.path)
from flask import Flask, request
import pathlib
import os
from transformers.generation.utils import GenerationConfig
os.environ['CUDA_VISIBLE_DEVICES']="3"
FAIL_RESPONSE = {'status': 500, 'message': 'request failed', 'data': {}}

def response(results):
    if len(results) == 0:
        return FAIL_RESPONSE
    return {'status': 200, 'message': 'request succeeded',
            'data': {'results': results, 'text': results[0]}}
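# For illustration: on success, response(['<generated text>']) returns an envelope like
# {'status': 200, 'message': 'request succeeded',
#  'data': {'results': ['<generated text>'], 'text': '<generated text>'}}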
def loadmodel():
    model_name = "your model name"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        device_map='auto')
    model.generation_config = GenerationConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        # LLaMA does not support the fast tokenizer
        use_fast=False)
    return model, tokenizer
def predict(query, model, tokenizer):
    message = query
    # print(f"[user] {query}", flush=True)
    # response = model.chat(tokenizer, message, stream=True)
    response = model.chat(tokenizer, message, history=None)
    print("predict output:", response)
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    # print('response=', response, flush=True)
    return response
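# Minimal smoke test for predict(), assuming model_name points at a checkpoint
# whose remote code implements chat() (e.g. Baichuan/Qwen chat models):
#   model, tokenizer = loadmodel()
#   print(predict("hello", model, tokenizer))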
root = pathlib.Path(os.path.abspath(__file__)).parent.parent
root_result_file = os.path.join(root, 'data')
model, tokenizer = loadmodel()

app = Flask(__name__)  # create the Flask application
app.config['JSON_AS_ASCII'] = False  # return non-ASCII characters as-is in JSON responses
print(app.make_config())
@app.route('/api/v1/qwen/14b/text', methods=['POST'])
def baichuan_7B_predict():
    try:
        param = request.get_json()
        print(param)
        # query = param["messages"]
        # The query is hard-coded here for testing; in production, read it from the payload above.
        query = "Can you write a sorting algorithm in Python?"
        result = predict(query, model, tokenizer)
        print("baichuan_7B_predict output:", result)
        resultData = response([result])
        return resultData
    except Exception as e:
        print("exception")
        print(str(e))
        return FAIL_RESPONSE
if __name__ == '__main__':
    # app.config.from_object(...)  # optional: load extra settings from a config object if one is defined
    app.run(host='0.0.0.0', port=8500, debug=False)
How to Expose a Fine-Tuned Large Model as an API with Flask
This article shows how to load a pre-trained model (such as "your model name") with Hugging Face's Transformers library and expose it through a Flask API that performs text generation on incoming user queries. The code demonstrates how to handle requests, load the model, call the model's chat interface, and return the prediction results.
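To exercise the endpoint, here is a minimal client sketch. It assumes the service is running locally on the port configured above (8500); the URL path and the response envelope match the code in this article, while the requests usage and payload shape are illustrative.

import requests

# Endpoint path as defined by the Flask route above; host and port assume a local run.
url = "http://127.0.0.1:8500/api/v1/qwen/14b/text"

# The handler as written ignores this payload (the query is hard-coded for testing),
# but this is the shape the commented-out param["messages"] line expects.
payload = {"messages": "Can you write a sorting algorithm in Python?"}

resp = requests.post(url, json=payload, timeout=300)
body = resp.json()

# The service signals errors in the body ('status': 500), not via the HTTP status code.
if body["status"] == 200:
    print(body["data"]["text"])  # generated text returned by the model
else:
    print("request failed:", body["message"])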