# 使用BERT和GPT-2计算句子困惑度PPL

8 篇文章 4 订阅
7 篇文章 0 订阅

### BERT

Chinese-BERT-wwm

# Pseudo-perplexity (PPL) of a sentence under a masked language model
# (Chinese-BERT-wwm): mask each token in turn, ask the model to predict it,
# and average the negative log-likelihoods over the sentence length.
import numpy as np
import torch
import torch.nn as nn

# NOTE(review): the original post used `model` before defining it and never
# imported transformers; a masked-LM head is required to score tokens.
from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained model and tokenizer (vocabulary).
model = BertForMaskedLM.from_pretrained('hfl/chinese-bert-wwm-ext')
model.eval()
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')

sentence = "我不会忘记和你一起奋斗的时光。"
tokenize_input = tokenizer.tokenize(sentence)
tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
sen_len = len(tokenize_input)
sentence_loss = 0.

for i, word in enumerate(tokenize_input):
    # Mask position i and run a forward pass (this step was missing in the
    # scraped original, leaving `output` undefined).
    tokenize_input[i] = '[MASK]'
    mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        output = model(mask_input)
    prediction_scores = output[0]
    # Log-probability the model assigns to the original token at position i.
    softmax = nn.Softmax(dim=0)
    ps = softmax(prediction_scores[0, i]).log()
    word_loss = ps[tensor_input[0, i]]
    sentence_loss += word_loss.item()
    # Restore the original token before masking the next position.
    tokenize_input[i] = word

# PPL = exp(mean negative log-likelihood over the sentence).
ppl = np.exp(-sentence_loss / sen_len)
print(ppl)

另一种“张量化”的写法（把句子重复 N 份、每份遮蔽一个位置，批量计算）：

def score(model, tokenizer, sentence):
    """Pseudo-perplexity of ``sentence`` via batched masking (tensor style).

    Encodes the sentence once, repeats it once per content token, masks a
    different position in each copy (skipping [CLS]/[SEP]), and uses the
    model's built-in masked-LM loss to average the negative log-likelihoods.

    NOTE(review): only fragments of this function survived in the source;
    the mask/label construction below is reconstructed around the surviving
    lines and the call site — confirm against the original article.
    """
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    # One copy of the sentence per maskable (non-special) token.
    repeat_input = tensor_input.repeat(tensor_input.size(-1) - 2, 1)
    # Shifted diagonal: row k masks token k+1 (skips [CLS]; last rows for
    # [SEP] are dropped).
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    # Only masked positions contribute to the loss (-100 is ignored by the
    # masked-LM cross-entropy).
    labels = repeat_input.masked_fill(masked_input != tokenizer.mask_token_id, -100)
    with torch.no_grad():
        loss = model(masked_input, labels=labels)[0]
    result = np.exp(loss.item())
    return result


s = score(model, tokenizer, '我不会忘记和你一起奋斗的时光。')
print(s)

### GPT-2

GPT2-Chinese

import torch
from torch.nn import CrossEntropyLoss

def cal_ppl_bygpt2():
    """Per-sentence perplexity for a batch of Chinese sentences under GPT-2.

    Returns a list of floats, one PPL per sentence, averaging token losses
    over real (non-padding) tokens only.
    """
    sens = ["今天是个好日子。", "天今子日。个是好", "这个婴儿有900000克呢。", "我不会忘记和你一起奋斗的时光。",
            "我不会记忘和你一起奋斗的时光。", "会我记忘和你斗起一奋的时光。"]
    # NOTE(review): the original never constructed `model` nor imported the
    # tokenizer. The UER Chinese GPT-2 ships a BERT-style vocabulary, hence
    # BertTokenizer here — confirm against the model card.
    from transformers import BertTokenizer, GPT2LMHeadModel
    tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
    model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
    inputs = tokenizer(sens, padding='max_length', max_length=50, truncation=True, return_tensors="pt")
    bs, sl = inputs['input_ids'].size()
    outputs = model(**inputs, labels=inputs['input_ids'])
    logits = outputs[1]
    # Shift so that tokens < n predict n.
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = inputs['input_ids'][:, 1:].contiguous()
    # Padding mask shifted the same way (was undefined in the original).
    shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
    # Flatten the tokens; ignore_index=0 also skips [PAD] (id 0) labels.
    loss_fct = CrossEntropyLoss(ignore_index=0, reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)).detach().reshape(bs, -1)
    # Mean loss over real tokens only, then exponentiate to get PPL.
    meanloss = loss.sum(1) / shift_attentions.sum(1)
    ppl = torch.exp(meanloss).numpy().tolist()
    return ppl


if __name__ == '__main__':
    # Print the result instead of silently discarding it.
    print(cal_ppl_bygpt2())
• 11
点赞
• 26
收藏
觉得还不错? 一键收藏
• 12
评论
01-24 424
08-03
04-18 477
08-03
10-18 526
07-26 6723
07-31 4340
08-27 1767
12-17 4853
05-17 599
01-23 561
12-13 8587

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。