1. Load the fine-tuned model:
# Import the required libraries
import json
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
import re
import os
# Load the model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("", trust_remote_code=True)
# model = AutoModel.from_pretrained("", trust_remote_code=True).quantize(8).half().cuda()
# model = model.eval()
tokenizer = AutoTokenizer.from_pretrained("", trust_remote_code=True)
# Load the pretrained language model with the P-tuning prefix length
config = AutoConfig.from_pretrained("", trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained("", config=config, trust_remote_code=True)
# Load the prefix-encoder weights from the fine-tuned checkpoint
prefix_state_dict = torch.load(os.path.join("", "pytorch_model.bin"))
new_prefix_state_dict = {}
# Strip the "transformer.prefix_encoder." prefix from the checkpoint keys
for k, v in prefix_state_dict.items():
    if k.startswith("transformer.prefix_encoder."):
        new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
# Quantize to 4 bit and run in half precision; keep the prefix encoder in fp32
model = model.quantize(4)
model = model.half().cuda()
model.transformer.prefix_encoder.float()
model = model.eval()
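Before starting evaluation, a quick sanity check helps confirm the checkpoint loaded correctly. A minimal sketch using ChatGLM's chat API (the prompt string is just an example):

# Smoke test: one chat turn with the loaded model (example prompt only)
response, history = model.chat(tokenizer, "你好", history=[])
print(response)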
2. Read the JSON file
# Read the JSON file (records are stored as adjacent bare objects, so the file is not valid JSON as-is)
with open(r'ChatGLM-6B-main\ChatGLM-6B-main\ptuning\EmpatheticDialogues\test_zh.json', 'r', encoding='utf-8') as file:
    data = file.read()
# Repair the format: insert commas between adjacent objects
fixed_data = re.sub(r'}\s*{', '},{', data)
# Wrap in brackets and parse into a list of records
json_data = json.loads('[' + fixed_data + ']')
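Each record is expected to expose a "dialogue" field (the conversation context) and a "target" field (the reference reply), since those are the keys read in the evaluation loops below; a quick check of this assumed shape:

# Sanity check on the assumed record shape: expect 'dialogue' and 'target' keys
print(len(json_data), json_data[0].keys())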
3. Compute PPL
Feed the context to the model to predict the probability vector of the first output token, then append the first ground-truth token to the input and predict the probability vector of the second output token; repeat until the whole target has been consumed (teacher forcing).
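Averaging the per-token losses gives a cross-entropy, and perplexity follows from the standard relation (with T the number of target tokens):

\mathrm{PPL} = \exp\Big(\frac{1}{T}\sum_{t=1}^{T} -\log p(y_t \mid x, y_{<t})\Big)

This is what the torch.exp over the averaged loss computes in the evaluation loop below.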
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm
from torch import nn
# Define the per-sample loss function used for PPL
def calculate_loss(context, target):
    # Convert context and target into input ids and target ids
    input_ids = tokenizer.encode(context, return_tensors="pt").to('cuda')
    target_ids = tokenizer.encode(target, return_tensors="pt").to('cuda')
    target_length = target_ids.size(1)
    # Predict the output with the model
    output_ids = []
    logits_list = []
    probabilities_list = []
    # Align target and output with a teacher-forcing loop
    for i in range(target_length):
        # output holds the probability vectors for positions [1, N+1], predicted from input [0, N]
        output = model(input_ids)
        logits = output.logits[0]
        # logits has one row per position; its shape grows each step,
        # e.g. torch.Size([37, 130528]), then torch.Size([38, 130528])
        probabilities = logits.softmax(dim=-1)
        next_token_id = logits.argmax(dim=-1)[-1]
        output_ids.append(next_token_id.cpu().detach().numpy())
        # logits[-1] is the last position, i.e. token N+1 of the sentence [1, N+1]
        logits_list.append(logits[-1].cpu().detach().numpy())
        probabilities_list.append(probabilities[-1].cpu().detach().numpy())
        # Append the i-th ground-truth token to the input to predict the next token
        input_ids = torch.cat([input_ids, target_ids[0, i][None, None]], dim=-1)
    text = tokenizer.decode(output_ids)
    # print("Output:", text)
    # print("Target:", target)
    logits = torch.Tensor(np.array(logits_list, dtype="float32")).to("cuda")
    logits = torch.nn.functional.softmax(logits, dim=-1)
    y_true = F.one_hot(target_ids, num_classes=130528)
    y_pred = logits[None]
    # print(y_true.shape, y_pred.shape)
    # Cross-entropy between one-hot targets and predicted distributions
    loss = -torch.sum(y_true * torch.log(y_pred + 1e-5), dim=-1)
    loss = torch.mean(loss)
    # print("Loss", loss)
    return loss.cpu().detach().numpy()
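The manual one-hot cross entropy above is equivalent, up to the +1e-5 smoothing term, to PyTorch's built-in F.cross_entropy. A minimal sketch of a hypothetical replacement for the closing lines of calculate_loss (sequence_ce_loss is not part of the original code):

# Hypothetical helper: same mean loss via F.cross_entropy (no manual one-hot,
# and without the +1e-5 smoothing, so values differ slightly)
def sequence_ce_loss(step_logits, target_ids):
    # step_logits: list of per-step numpy logit vectors; target_ids: [1, T] tensor
    raw = torch.Tensor(np.array(step_logits, dtype="float32")).to("cuda")
    return F.cross_entropy(raw, target_ids[0])  # mean over the T target tokens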
total_loss = 0
for i, item in enumerate(tqdm(json_data)):
    # print(f"---------------iter:{i+1:04d}-------------------")
    context = item["dialogue"]
    target = item["target"]
    # print("context: ", context)
    # print("target: ", target)
    loss = calculate_loss(context, target)
    total_loss += loss
    avg_loss = total_loss / (i + 1)
    # ppl = torch.exp(torch.Tensor([avg_loss]))
    ppl = torch.exp(torch.tensor(avg_loss).float()).cpu().detach().numpy()
    # Update the readout below the progress bar
    tqdm.write(f"current avg loss: {avg_loss}, current PPL: {ppl}", end="\r")
4. Compute Distinct
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm
from torch import nn
def distinct_n(tokens, n):
    # Slide a window of length n over tokens to collect n-grams
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    if not ngrams:
        # Fewer than n tokens yields no n-grams; avoid division by zero
        return 0.0
    distinct_ngrams = set(ngrams)
    return len(distinct_ngrams) / len(ngrams)

def calculate_distinct(data, n):
    # Tokenize every entry in the data list and pool all tokens
    all_tokens = []
    for entry in data:
        tokens = tokenizer.tokenize(entry)
        all_tokens.extend(tokens)
    distinct = distinct_n(all_tokens, n)
    return distinct
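Distinct-n is simply the ratio of unique n-grams to total n-grams; a toy example with made-up tokens:

# Toy example (made-up tokens): bigrams are ('I','am'), ('am','I'), ('I','am')
print(distinct_n(["I", "am", "I", "am"], 2))  # 2 unique / 3 total = 0.666...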
dis_1_sum = 0
dis_2_sum = 0
for i, item in enumerate(tqdm(json_data)):
    # print(f"---------------iter:{i+1:04d}-------------------")
    context = item["dialogue"]
    target = item["target"]
    # print("context: ", context)
    # print("target: ", target)
    response, history = model.chat(tokenizer, context, history=[])
    # Wrap the response in a list: calculate_distinct expects a list of strings
    distinct_1 = calculate_distinct([response], 1)
    distinct_2 = calculate_distinct([response], 2)
    dis_1_sum += distinct_1
    dis_2_sum += distinct_2
    # Update the readout below the progress bar (average over the i+1 responses so far)
    tqdm.write(f"distinct_1: {dis_1_sum/(i+1)}, distinct_2: {dis_2_sum/(i+1)}", end="\r")