Ref: Qwen官方文档
一、数据准备
本文使用的数据集是中文医疗问答数据集,节选其中内科7644条数据(sample_IM5000-6000.csv)。
原始数据示例:
我们需要将数据转换为 Qwen 适用的 json 数据,示例如下:
[
{
"id": "identity_0",
"conversations": [
{
"from": "user",
"value": "心血管科问诊,高血压患者能吃党参吗?,我有高血压这两天女婿来的时候给我拿了些党参泡水喝,您好高血压可以吃党参吗?"
},
{
"from": "assistant",
"value": "高血压病人可以口服党参的。党参有降血脂,降血压的作用,可以彻底消除血液中的垃圾,从而对冠心病以及心血管疾病的患者都有一定的稳定预防工作作用,因此平时口服党参能远离三高的危害。另外党参除了益气养血,降低中枢神经作用,调整消化系统功能,健脾补肺的功能。感谢您的进行咨询,期望我的解释对你有所帮助。"
}
]
}
]
具体转换代码(划分80%训练集+20%验证集):
# coding=gb18030
"""Convert the Chinese medical QA CSV into Qwen-style conversation JSON,
split 80% train / 20% validation, and write one JSON file per split."""
import pandas as pd
import json
from sklearn.model_selection import train_test_split


def _rows_to_entries(df):
    """Build Qwen conversation entries from every row of *df*.

    Each row must provide the columns ``department``, ``title``, ``ask``
    and ``answer``.  The user turn is "<department>问诊,<title><ask>" and
    the assistant turn is the raw answer.  Entry ids reuse the original
    DataFrame index, so they are NOT consecutive after the split (this
    matches the original script's behavior).
    """
    entries = []
    for index, row in df.iterrows():
        user_value = f"{row['department']}问诊,{row['title']}{row['ask']}"
        entries.append({
            "id": f"identity_{index}",
            "conversations": [
                {"from": "user", "value": user_value},
                {"from": "assistant", "value": row['answer']},
            ],
        })
    return entries


def _write_json(entries, path):
    """Serialize *entries* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(entries, json_file, indent=4, ensure_ascii=False)


# Read the CSV.  'gb18030' is a portable superset of the Windows Chinese
# "ANSI" code page (cp936/GBK), so the script also runs on Linux.
csv_file_path = 'datasets/qwen/sample_IM5000-6000.csv'
df = pd.read_csv(csv_file_path, encoding='gb18030')

# 80/20 train/validation split; fixed seed for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert and write the training split.
train_json_file_path = './datasets/qwen/train_sample_IM5000-6000.json'
_write_json(_rows_to_entries(train_df), train_json_file_path)

# Convert and write the validation split.
val_json_file_path = './datasets/qwen/val_sample_IM5000-6000.json'
_write_json(_rows_to_entries(val_df), val_json_file_path)

print(f"Train data has been successfully converted to JSON format and saved to {train_json_file_path}")
print(f"Validation data has been successfully converted to JSON format and saved to {val_json_file_path}")
二、模型下载与微调训练
1. 下载源码
git clone https://github.com/QwenLM/Qwen.git
2. 配置环境
pip install -r requirements.txt
pip install mpi4py -i http://pypi.doubanio.com/simple/ --trusted-host pypi.doubanio.com
# 出现下列错误的话,用 conda 安装 mpi4py
# ERROR: Could not build wheels for mpi4py, which is required to install pyproject.toml-based projects
sudo apt update
sudo apt-get install libopenmpi-dev
# 执行以上两条命令后,重新执行 pip 安装 mpi4py 的命令;若仍然安装失败,改用 conda 方式安装
conda install mpi4py
pip install "peft<0.8.0" deepspeed
pip install optimum
pip install auto-gptq
# 若下载缓慢,可改用清华镜像:pip install -i https://pypi.tuna.tsinghua.edu.cn/simple auto-gptq
pip install tiktoken
pip install modelscope
3. 模型下载(无法科学上网,可以通过 modelscope 下载,需 pip install modelscope)
#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('Qwen/Qwen-7B-Chat-Int4', local_dir="/root/Qwen-7B-Chat-Int4")
4. 微调训练
修改微调训练脚本 finetune_qlora_single_gpu.sh 中的模型和数据路径,
MODEL="/root/Qwen-7B-Chat-Int4"
DATA="/root/datasets/qwen/train_sample_IM5000-6000.json"  # 注意:微调应使用上文生成的训练集文件,而非验证集
运行微调训练的脚本。
bash finetune/finetune_qlora_single_gpu.sh
5. 推理
# 推理
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import json
# 加载预训练的分词器
tokenizer = AutoTokenizer.from_pretrained("/root/Qwen-7B-Chat-Int4") # 这里需要替换成实际的Qwen模型ID
# 加载微调后的模型
model = AutoPeftModelForCausalLM.from_pretrained(
"output_qwen/test1", # 微调模型的路径
device_map="auto",
trust_remote_code=True
).eval()
# 假设你的JSON数据存储在一个名为data.json的文件中
with open('datasets/qwen/test.json', 'r', encoding='utf-8') as file:
test_set = json.load(file)
for dialog in test_set:
conversations = dialog["conversations"]
history = None
for i, conv in enumerate(conversations):
if conv["from"] == "user":
# 使用模型生成回答
response, history = model.chat(tokenizer, conv["value"], history=history)
# 将回答存储到下一个assistant字段
if i + 1 < len(conversations) and conversations[i + 1]["from"] == "assistant":
conversations[i + 1]["value"] = response
# 打印更新后的测试集
for dialog in test_set:
print(dialog)
# 将更新后的数据集保存到新的JSON文件中
with open('updated_data.json', 'w', encoding='utf-8') as file:
json.dump(test_set, file, ensure_ascii=False, indent=4)
print("数据已成功更新并保存到 updated_data.json 文件。")