Deploying the Tongyi Qianwen (Qwen) Chat Model (both the 1.8B-Int4 and 7B-Int4 versions were tested and work)
Official documentation link:
https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary
1. Environment Setup
Here are the main packages and versions I used:
python 3.8.18
gradio 4.8.0
modelscope 1.9.5
pytorch 2.1.0
tensorflow 2.13.0
transformers 4.32.0
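To confirm your local environment matches, you can print the installed versions (a quick sanity check, assuming the packages above are already installed):
# Print the versions of the key packages used in this guide
import torch, transformers, modelscope, gradio
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("modelscope:", modelscope.__version__)
print("gradio:", gradio.__version__)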
Dependencies:
Install the transformers library and related packages:
pip install transformers==4.32.0 accelerate tiktoken einops scipy transformers_stream_generator==0.0.4 peft deepspeed -i https://pypi.tuna.tsinghua.edu.cn/simple
Install the flash-attention library (recommended by the official docs):
git clone https://github.com/Dao-AILab/flash-attention
cd flash-attention && pip install .
# The installs below are optional and may be quite slow.
# pip install csrc/layer_norm
# pip install csrc/rotary
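After installation you can quickly check that flash-attention is importable (the installed package is named flash_attn; if the import fails, the Qwen modeling code simply falls back to the pure-PyTorch attention path):
# A successful import means the Qwen modeling code can use flash-attention
import flash_attn
print(flash_attn.__version__)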
The other packages have little impact on the result, so I won't list them all here.
2. Model Download
from modelscope.hub.snapshot_download import snapshot_download
# To download a different version, just change the model ID
model_dir = snapshot_download('qwen/Qwen-1_8B-Chat', cache_dir='/home/sc/vscode/qwen1.8b/model', revision='master')
Any version can be downloaded this way, including the -Int4 variants.
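For example, the Int4 quantized checkpoint used in the deployment section below can be fetched with the same call, only the model ID changes (note that loading the Int4 models additionally requires auto-gptq and optimum to be installed):
from modelscope.hub.snapshot_download import snapshot_download

# Same call as above; only the model ID is different
model_dir = snapshot_download('qwen/Qwen-1_8B-Chat-Int4', cache_dir='/home/sc/vscode/qwen1.8b/model', revision='master')
print(model_dir)  # local path where the weights were saved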
3. Calling the Model
Here is a test script to check that the downloaded model loads and chats correctly:
from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
tokenizer = AutoTokenizer.from_pretrained("./model/qwen/Qwen-1_8B-Chat", revision='master', trust_remote_code=True)
# use bf16
# model = AutoModelForCausalLM.from_pretrained("./model/qwen/Qwen-1_8B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("./model/qwen/Qwen-1_8B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("./model/qwen/Qwen-1_8B-Chat", device_map="cpu", trust_remote_code=True).eval()
# use auto mode, automatically select precision based on the device.
model = AutoModelForCausalLM.from_pretrained("./model/qwen/Qwen-1_8B-Chat", revision='master', device_map="auto", trust_remote_code=True).eval()
# 1st dialogue turn
response, history = model.chat(tokenizer, "你好", history=None)
print(response)
# 2nd dialogue turn
response, history = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", history=history)
print(response)
# 3rd dialogue turn
response, history = model.chat(tokenizer, "给这个故事起一个标题", history=history)
print(response)
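GenerationConfig is imported above but never used. If you want to control the sampling parameters, you can attach one to the model before calling chat (a minimal sketch; the values below are just examples):
# Load the generation settings shipped with the checkpoint, then override a few fields
model.generation_config = GenerationConfig.from_pretrained(
    "./model/qwen/Qwen-1_8B-Chat", trust_remote_code=True
)
model.generation_config.top_p = 0.8        # example value: nucleus sampling threshold
model.generation_config.temperature = 0.7  # example value: lower = more deterministic
response, history = model.chat(tokenizer, "你好", history=None)
print(response)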
4. Model Deployment (with Gradio)
This deployment is based on Gradio; a Flask or FastAPI version may follow later (a minimal FastAPI sketch is included at the end of this section).
from modelscope import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from loguru import logger
def load_model():
    global tokenizer, model
    tokenizer = AutoTokenizer.from_pretrained("./model/qwen/Qwen-1_8B-Chat-Int4", revision='master', trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "./model/qwen/Qwen-1_8B-Chat-Int4", revision='master',
        device_map="auto",
        trust_remote_code=True
    ).eval()

def predict(input, chatbot, history):
    chatbot.append((input, ""))
    response, history = model.chat(tokenizer, input, history)
    chatbot[-1] = (input, response)
    yield chatbot, history

def reset_user_input():
    return gr.update(value='')

# def reset_state():
#     model.clean_history()
#     return []
if __name__ == "__main__":
    load_model()
    with gr.Blocks() as demo:
        gr.HTML("""<h1 align="center">qwen1.8b</h1>""")
        chatbot = gr.Chatbot()
        with gr.Row():
            with gr.Column(scale=4):
                with gr.Column(scale=12):
                    user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10)
                with gr.Column(min_width=32, scale=1):
                    submitBtn = gr.Button("Submit", variant="primary")
            # with gr.Column(scale=1):
            #     emptyBtn = gr.Button("Clear History")
        history = gr.State([])
        past_key_values = gr.State(None)
        submitBtn.click(predict, [user_input, chatbot, history], [chatbot, history], show_progress=True)
        submitBtn.click(reset_user_input, [], [user_input])
        # emptyBtn.click(reset_state, outputs=[chatbot, history, past_key_values], show_progress=True)
    demo.queue().launch(share=False, inbrowser=True, server_name='0.0.0.0')
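As mentioned at the start of this section, a Flask/FastAPI deployment is planned. For reference, here is a minimal FastAPI sketch (my own illustration, not part of the original setup) that exposes the same model.chat call over HTTP:
# save as app.py and run with: uvicorn app:app --host 0.0.0.0 --port 8000
from fastapi import FastAPI
from pydantic import BaseModel
from modelscope import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()
tokenizer = AutoTokenizer.from_pretrained("./model/qwen/Qwen-1_8B-Chat-Int4", revision='master', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "./model/qwen/Qwen-1_8B-Chat-Int4", revision='master',
    device_map="auto", trust_remote_code=True
).eval()

class ChatRequest(BaseModel):
    query: str
    history: list = []  # previous turns as [user, assistant] pairs

@app.post("/chat")
def chat(req: ChatRequest):
    # Run one dialogue turn and return both the reply and the updated history
    response, history = model.chat(tokenizer, req.query, history=req.history or None)
    return {"response": response, "history": history}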