基础作业:
构建数据集,使用 XTuner 微调 InternLM-Chat-7B 模型, 让模型学习到它是你的智能小助手,效果如下图所示,本作业训练出来的模型的输出需要将不要葱姜蒜大佬替换成自己名字或昵称!
微调前(回答比较官方)
微调后(对自己的身份有了清晰的认知)
1.1微调环境准备
# InternStudio 平台中,从本地 clone 一个已有 pytorch 2.0.1 的环境(后续均在该环境执行,若为其他环境可作为参考)
# 进入环境后首先 bash
# 进入环境后首先 bash
# 进入环境后首先 bash
bash
conda create --name personal_assistant --clone=/root/share/conda_envs/internlm-base
# 如果在其他平台:
# conda create --name personal_assistant python=3.10 -y
# 激活环境
conda activate personal_assistant
# 进入家目录 (~的意思是 “当前用户的home路径”)
cd ~
# 创建版本文件夹并进入,以跟随本教程
# personal_assistant用于存放本教程所使用的东西
mkdir /root/personal_assistant && cd /root/personal_assistant
mkdir /root/personal_assistant/xtuner019 && cd /root/personal_assistant/xtuner019
# 拉取 0.1.9 的版本源码
git clone -b v0.1.9 https://github.com/InternLM/xtuner
# 无法访问github的用户请从 gitee 拉取:
# git clone -b v0.1.9 https://gitee.com/Internlm/xtuner
# 进入源码目录
cd xtuner
# 从源码安装 XTuner
pip install -e '.[all]'
1.2数据准备
创建data
文件夹用于存放用于训练的数据集
mkdir -p /root/personal_assistant/data && cd /root/personal_assistant/data
在data
目录下创建一个json文件personal_assistant.json
作为本次微调所使用的数据集。json中内容可参考下方(复制粘贴n次做数据增广,数据量小无法有效微调,下面仅用于展示格式,下面也有生成脚本)
其中conversation
表示一次对话的内容,input
为输入,即用户会问的问题,output
为输出,即想要模型回答的答案。
[
{
"conversation": [
{
"input": "请介绍一下你自己",
"output": "我是不要葱姜蒜大佬的小助手,内在是上海AI实验室书生·浦语的7B大模型哦"
}
]
},
{
"conversation": [
{
"input": "请做一下自我介绍",
"output": "我是不要葱姜蒜大佬的小助手,内在是上海AI实验室书生·浦语的7B大模型哦"
}
]
}
]
以下是一个python脚本,用于生成数据集。在data
目录下新建一个generate_data.py文件,将以下代码复制进去,然后运行该脚本即可生成数据集。
import json
# 输入你的名字
name = '饮酒醉沧桑'
# 重复次数
n = 10000
data = [
{
"conversation": [
{
"input": "请做一下自我介绍",
"output": "我是{}的小助手,内在是上海AI实验室书生·浦语的7B大模型哦".format(name)
}
]
}
]
for i in range(n):
data.append(data[0])
with open('personal_assistant.json', 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
1.3配置准备
下载模型InternLM-chat-7B
InternStudio 平台的 share
目录下已经为我们准备了全系列的 InternLM
模型,可以使用如下命令复制internlm-chat-7b
:
mkdir -p /root/personal_assistant/model/Shanghai_AI_Laboratory
cp -r /root/share/temp/model_repos/internlm-chat-7b /root/personal_assistant/model/Shanghai_AI_Laboratory
XTuner 提供多个开箱即用的配置文件,用户可以通过下列命令查看:
# 列出所有内置配置
xtuner list-cfg
#创建用于存放配置的文件夹config并进入
mkdir /root/personal_assistant/config && cd /root/personal_assistant/config
拷贝一个配置文件到当前目录:xtuner copy-cfg ${CONFIG_NAME} ${SAVE_PATH}
在本例中:(注意最后有个英文句号,代表复制到当前路径)
xtuner copy-cfg internlm_chat_7b_qlora_oasst1_e3 .
修改拷贝后的文件internlm_chat_7b_qlora_oasst1_e3_copy.py,修改后如下:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from bitsandbytes.optim import PagedAdamW32bit
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR
from peft import LoraConfig
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
pretrained_model_name_or_path = '/root/personal_assistant/model/Shanghai_AI_Laboratory/internlm-chat-7b'
# Data
data_path = '/root/personal_assistant/data/personal_assistant.json'
prompt_template = PROMPT_TEMPLATE.internlm_chat
max_length = 512
pack_to_max_length = True
# Scheduler & Optimizer
batch_size = 2 # per_device
accumulative_counts = 16
dataloader_num_workers = 0
max_epochs = 3
optim_type = PagedAdamW32bit
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
# Evaluate the generation performance during the training
evaluation_freq = 90
SYSTEM = ''
evaluation_inputs = [ '请介绍一下你自己', '请做一下自我介绍' ]
#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
tokenizer = dict(
type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
padding_side='right')
model = dict(
type=SupervisedFinetune,
llm=dict(
type=AutoModelForCausalLM.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.float16,
quantization_config=dict(
type=BitsAndBytesConfig,
load_in_4bit=True,
load_in_8bit=False,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4')),
lora=dict(
type=LoraConfig,
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias='none',
task_type='CAUSAL_LM'))
#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
train_dataset = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)),
tokenizer=tokenizer,
max_length=max_length,
dataset_map_fn=None,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length)
train_dataloader = dict(
batch_size=batch_size,
num_workers=dataloader_num_workers,
dataset=train_dataset,
sampler=dict(type=DefaultSampler, shuffle=True),
collate_fn=dict(type=default_collate_fn))
#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
optim_wrapper = dict(
type=AmpOptimWrapper,
optimizer=dict(
type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
accumulative_counts=accumulative_counts,
loss_scale='dynamic',
dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = dict(
type=CosineAnnealingLR,
eta_min=0.0,
by_epoch=True,
T_max=max_epochs,
convert_to_iter_based=True)
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
#######################################################################
# PART 5 Runtime #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
dict(type=DatasetInfoHook, tokenizer=tokenizer),
dict(
type=EvaluateChatHook,
tokenizer=tokenizer,
every_n_iters=evaluation_freq,
evaluation_inputs=evaluation_inputs,
system=SYSTEM,
prompt_template=prompt_template)
]
# configure default hooks
default_hooks = dict(
# record the time of every iteration.
timer=dict(type=IterTimerHook),
# print log every 100 iterations.
logger=dict(type=LoggerHook, interval=10),
# enable the parameter scheduler.
param_scheduler=dict(type=ParamSchedulerHook),
# save checkpoint per epoch.
checkpoint=dict(type=CheckpointHook, interval=1),
# set sampler seed in distributed evrionment.
sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
cudnn_benchmark=False,
# set multi process parameters
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
# set distributed parameters
dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)
红框为配置文件中PART 3需要修改的内容
1.4微调启动
用xtuner train
命令启动训练
xtuner train /root/personal_assistant/config/internlm_chat_7b_qlora_oasst1_e3_copy.py
会在训练完成后,输出用于验证的Sample output
1.5微调后参数转换/合并
训练后的pth格式参数转Hugging Face格式
# 创建用于存放Hugging Face格式参数的hf文件夹
mkdir /root/personal_assistant/config/work_dirs/hf
export MKL_SERVICE_FORCE_INTEL=1
# 配置文件存放的位置
export CONFIG_NAME_OR_PATH=/root/personal_assistant/config/internlm_chat_7b_qlora_oasst1_e3_copy.py
# 模型训练后得到的pth格式参数存放的位置
export PTH=/root/personal_assistant/config/work_dirs/internlm_chat_7b_qlora_oasst1_e3_copy/epoch_3.pth
# pth文件转换为Hugging Face格式后参数存放的位置
export SAVE_PATH=/root/personal_assistant/config/work_dirs/hf
# 执行参数转换
xtuner convert pth_to_hf $CONFIG_NAME_OR_PATH $PTH $SAVE_PATH
Merge模型参数
export MKL_SERVICE_FORCE_INTEL=1
export MKL_THREADING_LAYER='GNU'
# 原始模型参数存放的位置
export NAME_OR_PATH_TO_LLM=/root/personal_assistant/model/Shanghai_AI_Laboratory/internlm-chat-7b
# Hugging Face格式参数存放的位置
export NAME_OR_PATH_TO_ADAPTER=/root/personal_assistant/config/work_dirs/hf
# 最终Merge后的参数存放的位置
mkdir /root/personal_assistant/config/work_dirs/hf_merge
export SAVE_PATH=/root/personal_assistant/config/work_dirs/hf_merge
# 执行参数Merge
xtuner convert merge \
$NAME_OR_PATH_TO_LLM \
$NAME_OR_PATH_TO_ADAPTER \
$SAVE_PATH \
--max-shard-size 2GB
1.6网页DEMO
安装网页Demo所需依赖
pip install streamlit==1.24.0
下载InternLM项目代码(欢迎Star)
# 创建code文件夹用于存放InternLM项目代码
mkdir /root/personal_assistant/code && cd /root/personal_assistant/code
git clone https://github.com/InternLM/InternLM.git
将 /root/code/InternLM/web_demo.py
中 29 行和 33 行的模型路径更换为Merge后存放参数的路径 /root/personal_assistant/config/work_dirs/hf_merge
运行 /root/personal_assistant/code/InternLM
目录下的 web_demo.py
文件,输入以下命令后,查看本教程5.2配置本地端口后,将端口映射到本地。在本地浏览器输入 http://127.0.0.1:6006
即可。
streamlit run /root/personal_assistant/code/InternLM/web_demo.py --server.address 127.0.0.1 --server.port 6006
注意:要在浏览器打开 http://127.0.0.1:6006
页面后,模型才会加载。 在加载完模型之后,就可以与微调后的 InternLM-Chat-7B 进行对话了
1.7实际遇到的问题
实际使用streamlit时,出现以下报错
Traceback (most recent call last):
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/elements/image.py", line 361, in image_to_url
with open(image, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'doc/imgs/user.png'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/memory_media_file_storage.py", line 164, in _read_file
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'doc/imgs/user.png'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/elements/chat.py", line 80, in _process_avatar_input
return AvatarType.IMAGE, image_to_url(
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/elements/image.py", line 371, in image_to_url
url = runtime.get_instance().media_file_mgr.add(image, mimetype, image_id)
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/media_file_manager.py", line 222, in add
file_id = self._storage.load_and_get_id(
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/memory_media_file_storage.py", line 115, in load_and_get_id
file_data = self._read_file(path_or_data)
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/memory_media_file_storage.py", line 167, in _read_file
raise MediaFileStorageError(f"Error opening '{filename}'") from ex
streamlit.runtime.media_file_storage.MediaFileStorageError: Error opening 'doc/imgs/user.png'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 552, in _run_script
exec(code, module.__dict__)
File "/root/personal_assistant/code/InternLM/web_demo.py", line 119, in <module>
main()
File "/root/personal_assistant/code/InternLM/web_demo.py", line 95, in main
with st.chat_message("user", avatar=user_avator):
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/runtime/metrics_util.py", line 356, in wrapped_func
result = non_optional_func(*args, **kwargs)
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/elements/chat.py", line 196, in chat_message
avatar_type, converted_avatar = _process_avatar_input(avatar)
File "/root/.conda/envs/personal_assistant/lib/python3.10/site-packages/streamlit/elements/chat.py", line 89, in _process_avatar_input
raise StreamlitAPIException(
streamlit.errors.StreamlitAPIException: Failed to load the provided avatar value as an image.
尝试将图片路径修改为绝对路径,但是也无法运行,最后解决方案如下:
我们可以在 /root/personal_assistant/code
目录下新建一个 cli_demo.py
文件,将以下代码填入其中
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name_or_path = "/root/personal_assistant/model/Shanghai_AI_Laboratory/internlm-chat-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map='auto')
model = model.eval()
system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
"""
messages = [(system_prompt, '')]
print("=============Welcome to InternLM chatbot, type 'exit' to exit.=============")
while True:
input_text = input("User >>> ")
input_text = input_text.replace(' ', '')
if input_text == "exit":
break
response, history = model.chat(tokenizer, input_text, history=messages)
messages.append((input_text, response))
print(f"robot >>> {response}")