I. Deploying the InternLM2-Chat-1.8B Model for Intelligent Conversation
1. Setting Up the Base Environment
First, open the Intern Studio interface and click 创建开发机 (Create Dev Machine) to configure a development machine.
Fill in the 开发机名称 (dev machine name), click 选择镜像 (Select Image) and choose the Cuda11.7-conda image, then under resource configuration select the 10% A100 * 1 option and create the dev machine.
Click the 进入开发机 (Enter Dev Machine) option.
Once inside the dev machine, enter the environment setup command in the terminal (the setup takes quite a while, so please be patient):
studio-conda -o internlm-base -t demo
# Equivalent configuration to studio-conda:
# conda create -n demo python==3.10 -y
# conda activate demo
# conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
After the configuration finishes, activate the newly created conda environment:
conda activate demo
Run the following commands to install the required packages:
pip install huggingface-hub==0.17.3
pip install transformers==4.34
pip install psutil==5.9.8
pip install accelerate==0.24.1
pip install streamlit==1.32.2
pip install matplotlib==3.8.3
pip install modelscope==1.9.5
pip install sentencepiece==0.1.99
2. Downloading the InternLM2-Chat-1.8B Model
Create the folder and script files at the following paths, then change into the directory:
mkdir -p /root/demo
touch /root/demo/cli_demo.py
touch /root/demo/download_mini.py
cd /root/demo
In the file browser on the left, double-click to enter the demo folder.
Double-click to open the /root/demo/download_mini.py file and paste in the following code:
import os
from modelscope.hub.snapshot_download import snapshot_download

# Create the directory that will hold the model
os.system("mkdir /root/models")

# save_dir is the local directory the model is saved to
save_dir = "/root/models"

snapshot_download("Shanghai_AI_Laboratory/internlm2-chat-1_8b",
                  cache_dir=save_dir,
                  revision='v1.1.0')
Run the command to download the model weight files:
python /root/demo/download_mini.py
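Once the script finishes, the weights should sit under /root/models. As an optional sanity check (not part of the original steps), you can list the directory that cli_demo.py later loads from:

ls /root/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b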
3. Running cli_demo
Double-click to open the /root/demo/cli_demo.py file and paste in the following code:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name_or_path = "/root/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, device_map='cuda:0')
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map='cuda:0')
model = model.eval()

system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
"""

messages = [(system_prompt, '')]

print("=============Welcome to InternLM chatbot, type 'exit' to exit.=============")

while True:
    input_text = input("\nUser >>> ")
    input_text = input_text.replace(' ', '')
    if input_text == "exit":
        break

    length = 0
    for response, _ in model.stream_chat(tokenizer, input_text, messages):
        if response is not None:
            print(response[length:], flush=True, end="")
            length = len(response)
Enter the commands to run the demo program:
conda activate demo
python /root/demo/cli_demo.py
Wait for the model to finish loading, then type in an example prompt:
请创作一个 300 字的小故事 (please write a short story of about 300 characters)
II. Deploying 八戒-Chat-1.8B, an Outstanding Community Model from ModelScope
1. Downloading the Model
import os
# Download the model
from modelscope.hub.snapshot_download import snapshot_download

# Create the directory that will hold the model
os.system("mkdir -p /root/models")

# save_dir is the local directory the model is saved to
save_dir = "/root/models"

snapshot_download('JimmyMa99/BaJie-Chat-mini',
                  cache_dir=save_dir)
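The snippet above only defines the download; as in Part I, it still needs to be saved to a file and executed. A minimal sketch, assuming the hypothetical filename /root/demo/download_bajie.py:

touch /root/demo/download_bajie.py   # paste the snippet above into this file
python /root/demo/download_bajie.py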
2. Chatting with the Model via Streamlit
Download the provided chat.py file (its contents are listed below); note that the launch command later refers to it as bajie_chat.py, so save it under that name:
# isort: skip_file
import copy
import warnings
from dataclasses import asdict, dataclass
from typing import Callable, List, Optional
import streamlit as st
import torch
from torch import nn
from transformers.generation.utils import (LogitsProcessorList,
StoppingCriteriaList)
from transformers.utils import logging
from transformers import AutoTokenizer, AutoModelForCausalLM # isort: skip
logger = logging.get_logger(__name__)
@dataclass
class GenerationConfig:
# this config is used for chat to provide more diversity
max_length: int = 32768
top_p: float = 0.8
temperature: float = 0.8
do_sample: bool = True
repetition_penalty: float = 1.005
@torch.inference_mode()
def generate_interactive(
model,
tokenizer,
prompt,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
List[int]]] = None,
additional_eos_token_id: Optional[int] = None,
**kwargs,
):
inputs = tokenizer([prompt], padding=True, return_tensors='pt')
input_length = len(inputs['input_ids'][0])
for k, v in inputs.items():
inputs[k] = v.cuda()
input_ids = inputs['input_ids']
_, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
if generation_config is None:
generation_config = model.generation_config
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs)
bos_token_id, eos_token_id = ( # noqa: F841 # pylint: disable=W0612
generation_config.bos_token_id,
generation_config.eos_token_id,
)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
if additional_eos_token_id is not None:
eos_token_id.append(additional_eos_token_id)
has_default_max_length = kwargs.get(
'max_length') is None and generation_config.max_length is not None
if has_default_max_length and generation_config.max_new_tokens is None:
warnings.warn(
f"Using 'max_length''s default ({repr(generation_config.max_length)}) \
to control the generation length. "
'This behaviour is deprecated and will be removed from the \
config in v5 of Transformers -- we'
' recommend using `max_new_tokens` to control the maximum \
length of the generation.',
UserWarning,
)
elif generation_config.max_new_tokens is not None:
generation_config.max_length = generation_config.max_new_tokens + \
input_ids_seq_length
if not has_default_max_length:
logger.warn( # pylint: disable=W4902
f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
f"and 'max_length'(={generation_config.max_length}) seem to "
"have been set. 'max_new_tokens' will take precedence. "
'Please refer to the documentation for more information. '
'(https://huggingface.co/docs/transformers/main/'
'en/main_classes/text_generation)',
UserWarning,
)
if input_ids_seq_length >= generation_config.max_length:
input_ids_string = 'input_ids'
logger.warning(
f"Input length of {input_ids_string} is {input_ids_seq_length}, "
f"but 'max_length' is set to {generation_config.max_length}. "
'This can lead to unexpected behavior. You should consider'
" increasing 'max_new_tokens'.")
# 2. Set generation parameters if not already defined
logits_processor = logits_processor if logits_processor is not None \
else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None \
else StoppingCriteriaList()
logits_processor = model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=input_ids,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
logits_processor=logits_processor,
)
stopping_criteria = model._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=stopping_criteria)
logits_warper = model._get_logits_warper(generation_config)
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
scores = None
while True:
model_inputs = model.prepare_inputs_for_generation(
input_ids, **model_kwargs)
# forward pass to get next token
outputs = model(
**model_inputs,
return_dict=True,
output_attentions=False,
output_hidden_states=False,
)
next_token_logits = outputs.logits[:, -1, :]
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
next_token_scores = logits_warper(input_ids, next_token_scores)
# sample
probs = nn.functional.softmax(next_token_scores, dim=-1)
if generation_config.do_sample:
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
else:
next_tokens = torch.argmax(probs, dim=-1)
# update generated ids, model inputs, and length for next step
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
model_kwargs = model._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=False)
unfinished_sequences = unfinished_sequences.mul(
(min(next_tokens != i for i in eos_token_id)).long())
output_token_ids = input_ids[0].cpu().tolist()
output_token_ids = output_token_ids[input_length:]
for each_eos_token_id in eos_token_id:
if output_token_ids[-1] == each_eos_token_id:
output_token_ids = output_token_ids[:-1]
response = tokenizer.decode(output_token_ids)
yield response
# stop when each sentence is finished
# or if we exceed the maximum length
if unfinished_sequences.max() == 0 or stopping_criteria(
input_ids, scores):
break
def on_btn_click():
del st.session_state.messages
@st.cache_resource
def load_model():
model = (AutoModelForCausalLM.from_pretrained('/root/models/JimmyMa99/BaJie-Chat-mini',
trust_remote_code=True).to(
torch.bfloat16).cuda())
tokenizer = AutoTokenizer.from_pretrained('/root/models/JimmyMa99/BaJie-Chat-mini',
trust_remote_code=True)
return model, tokenizer
def prepare_generation_config():
with st.sidebar:
max_length = st.slider('Max Length',
min_value=8,
max_value=32768,
value=32768)
top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01)
temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01)
st.button('Clear Chat History', on_click=on_btn_click)
generation_config = GenerationConfig(max_length=max_length,
top_p=top_p,
temperature=temperature)
return generation_config
user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
<|im_start|>assistant\n'
def combine_history(prompt):
messages = st.session_state.messages
meta_instruction = ('你是猪八戒,猪八戒说话幽默风趣,说话方式通常表现为直率、幽默,有时带有一点自嘲和调侃。'
'你的话语中常常透露出对食物的喜爱和对安逸生活的向往,同时也显示出他机智和有时的懒惰特点。'
'尽量保持回答的自然回答,当然你也可以适当穿插一些文言文,另外,书生·浦语是你的好朋友,是你的AI助手。')
total_prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
for message in messages:
cur_content = message['content']
if message['role'] == 'user':
cur_prompt = user_prompt.format(user=cur_content)
elif message['role'] == 'robot':
cur_prompt = robot_prompt.format(robot=cur_content)
else:
raise RuntimeError
total_prompt += cur_prompt
total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
return total_prompt
def main():
# torch.cuda.empty_cache()
print('load model begin.')
model, tokenizer = load_model()
print('load model end.')
st.title('猪猪Chat-InternLM2')
generation_config = prepare_generation_config()
# Initialize chat history
if 'messages' not in st.session_state:
st.session_state.messages = []
# Display chat messages from history on app rerun
for message in st.session_state.messages:
with st.chat_message(message['role']):
st.markdown(message['content'])
# Accept user input
if prompt := st.chat_input('What is up?'):
# Display user message in chat message container
with st.chat_message('user'):
st.markdown(prompt)
real_prompt = combine_history(prompt)
# Add user message to chat history
st.session_state.messages.append({
'role': 'user',
'content': prompt,
})
with st.chat_message('robot'):
message_placeholder = st.empty()
for cur_response in generate_interactive(
model=model,
tokenizer=tokenizer,
prompt=real_prompt,
additional_eos_token_id=92542,
**asdict(generation_config),
):
# Display robot response in chat message container
message_placeholder.markdown(cur_response + '▌')
message_placeholder.markdown(cur_response)
# Add robot response to chat history
st.session_state.messages.append({
'role': 'robot',
'content': cur_response, # pylint: disable=undefined-loop-variable
})
torch.cuda.empty_cache()
if __name__ == '__main__':
main()
Running the demo + local port configuration:
- Launch with Streamlit: streamlit run bajie_chat.py --server.address 127.0.0.1 --server.port 6006
- Local port forwarding (run on your local machine; 40514 is this dev machine's SSH port, so substitute your own): ssh -CNg -L 6006:127.0.0.1:6006 root@ssh.intern-ai.org.cn -p 40514
The model is only loaded when the link (http://127.0.0.1:6006 in a local browser) is actually opened.
The model does not know much about Journey to the West; this may be because no Journey-to-the-West-related dataset was used during fine-tuning.
III. Running a Lagent Agent with InternLM2-Chat-7B
1. Getting the Model (no download needed here; a symbolic link to the shared weights is enough):
ln -s /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-7b /root/models/internlm2-chat-7b
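Optionally, you can verify that the symbolic link was created and points at the shared weights (a quick sanity check, not part of the original steps):

ls -l /root/models/internlm2-chat-7b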
2. Getting Lagent (installed from source):
git clone https://gitee.com/internlm/lagent.git
Change into the lagent directory created by the clone, then install it from source:
cd lagent
pip install -e .  # install from source
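To confirm that the editable install succeeded, pip can report the installed package (a quick check, not part of the original steps):

pip show lagent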
3. Launching with examples/internlm2_agent_web_demo_hf.py
If the demo script does not already point at the local weights, change the model path inside examples/internlm2_agent_web_demo_hf.py to /root/models/internlm2-chat-7b so that the symbolic link created above is actually used, then start it from the lagent directory:
streamlit run examples/internlm2_agent_web_demo_hf.py --server.address 127.0.0.1 --server.port 6006
As before, set up the local port mapping:
ssh -CNg -L 6006:127.0.0.1:6006 root@ssh.intern-ai.org.cn -p 40514
After loading completes, the model takes roughly 16 GB of GPU memory (more than the 8 GB provided by the 10% A100 allocation used in Part I, so a larger resource configuration is needed for this part).
Trying out the different options:
Without the plugin it is very slow, but GPU memory usage is lower.
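To watch actual GPU memory usage while trying these options, the standard NVIDIA tool can be run in a second terminal (a general-purpose check, not specific to this tutorial):

nvidia-smi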