qwenty

白桃乌龙奶冻

已于 2024-08-02 13:12:20 修改

阅读量73

点赞数 2

文章标签： python

于 2024-08-01 17:21:40 首次发布

本文链接：https://blog.csdn.net/qq_42248452/article/details/140852931

版权

# pip install mpi4py -i http://pypi.doubanio.com/simple/ --trusted-host pypi.doubanio.com
# # ERROR: Could not build wheels for mpi4py, which is required to install pyproject.toml-based projects，出现此错误的话，conda安装
# sudo apt update
# sudo apt-get install libopenmpi-dev
# # 执行以上两条命令后， 重新下载安装mpi4py命令，发现还是安装失败，改用conda的方式安装
# conda install mpi4py

# pip install optimum
# pip install auto-gptq
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple auto-gptq
# pip install tiktoken


#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('Qwen/Qwen-7B-Chat-Int4', local_dir="/root/Qwen-7B-Chat-Int4")


# 微调训练
# MODEL="/root/Qwen-7B-Chat-Int4" 
# DATA="/root/datasets/qwen/val_sample_IM5000-6000.json"

# bash finetune/finetune_lora_single_gpu.sh


# 推理
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import json

# 加载预训练的分词器
tokenizer = AutoTokenizer.from_pretrained("/root/Qwen-7B-Chat-Int4")  # 这里需要替换成实际的Qwen模型ID

# 加载微调后的模型
model = AutoPeftModelForCausalLM.from_pretrained(
    "output_qwen/test1",  # 微调模型的路径
    device_map="auto",
    trust_remote_code=True
).eval()

# 假设你的JSON数据存储在一个名为data.json的文件中
with open('datasets/qwen/test.json', 'r', encoding='utf-8') as file:
    test_set = json.load(file)

# 引入你的模型和tokenizer
# model 和 tokenizer 应该是你预先训练好的模型和相应的分词器
# 这里我们假设它们已经被定义并且可用
# from your_model_module import model, tokenizer

for dialog in test_set:
    conversations = dialog["conversations"]
    history = None
    for i, conv in enumerate(conversations):
        if conv["from"] == "user":
            # 使用模型生成回答
            response, history = model.chat(tokenizer, conv["value"], history=history)
            # 将回答存储到下一个assistant字段
            if i + 1 < len(conversations) and conversations[i + 1]["from"] == "assistant":
                conversations[i + 1]["value"] = response

# 打印更新后的测试集
for dialog in test_set:
    print(dialog)
# 将更新后的数据集保存到新的JSON文件中
with open('updated_data.json', 'w', encoding='utf-8') as file:
    json.dump(test_set, file, ensure_ascii=False, indent=4)

print("数据已成功更新并保存到 updated_data.json 文件。")

#coding=gb18030
import pandas as pd
import json

# 读取CSV文件
csv_file_path = 'datasets/qwen/sample_IM5000-6000.csv'
df = pd.read_csv(csv_file_path, encoding='ANSI')

# 创建一个空列表来保存转换后的数据
data_list = []

# 遍历DataFrame的每一行
for index, row in df.iterrows():
    # 构造新的行数据
    department = row['department'] + '问诊'
    title = row['title']
    ask = row['ask']
    answer = row['answer']

    # 拼接user的value
    user_value = f"{department}，{title}{ask}"

    # 创建对话结构
    conversations = [
        {"from": "user", "value": user_value},
        {"from": "assistant", "value": answer}
    ]

    # 创建完整条目
    entry = {
        "id": f"identity_{index}",
        "conversations": conversations
    }

    # 添加到列表中
    data_list.append(entry)

# 将数据序列化为JSON格式
json_data = json.dumps(data_list, indent=4, ensure_ascii=False)

# 写入JSON文件
json_file_path = './datasets/qwen/sample_IM5000-6000.json'
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)

print(f"Data has been successfully converted to JSON format and saved to {json_file_path}")