Quick summary
1. Perplexity is the probability the language model assigns to the test sequence, raised to the power -1/m (m = number of tokens): PPL = P(w_1, \ldots, w_m)^{-1/m}.
2. It can be written in cross-entropy form: log perplexity and cross-entropy are equivalent.
Detailed derivation
From the Zhihu answer 求通俗解释NLP里的perplexity是什么? - 知乎
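A short sketch of the equivalence, using the standard definition for a test sequence w_1, \ldots, w_m (a summary, not a quotation from the linked answer):

\mathrm{PPL}(W) = P(w_1, w_2, \ldots, w_m)^{-1/m}

\log \mathrm{PPL}(W) = -\frac{1}{m}\log P(w_1, \ldots, w_m) = -\frac{1}{m}\sum_{i=1}^{m}\log P(w_i \mid w_1, \ldots, w_{i-1})

The right-hand side is the average negative log-likelihood, i.e. the cross-entropy of the model on the sequence, so \mathrm{PPL}(W) = \exp(\text{cross-entropy}); minimizing cross-entropy and minimizing perplexity are the same objective.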
--------------------
Computing PPL with BERT
Mask each token in turn, take log_softmax over the logits at that position, accumulate the log-probabilities of the true tokens, then exponentiate the negative average (strictly a pseudo-perplexity, since BERT is a masked LM).
from transformers import BertTokenizer, BertForMaskedLM
import torch

def bert_ppl_without_numpy(sentence, model_path='bert-base-uncased'):
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForMaskedLM.from_pretrained(model_path)
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    total_log_prob = 0.0
    # Mask one position at a time and score the true token there
    for i in range(len(tokens)):
        masked_tokens = tokens.copy()
        masked_tokens[i] = tokenizer.mask_token
        masked_input = torch.tensor([tokenizer.convert_tokens_to_ids(masked_tokens)])
        with torch.no_grad():
            outputs = model(masked_input)
        logits = outputs.logits[0, i]
        # Log-probability of the original token at the masked position
        log_prob = torch.log_softmax(logits, dim=0)[input_ids[0, i]]
        total_log_prob += log_prob.item()
    # Pseudo-perplexity = exp of the negative average log-probability
    avg_log_prob = total_log_prob / len(tokens)
    ppl = torch.exp(torch.tensor(-avg_log_prob)).item()
    return ppl

bert_ppl_without_numpy("I am a student")
Computing PPL with GPT-2
The loss returned by the model is already the average negative log-likelihood over the tokens, so torch.exp(loss) gives the perplexity directly.
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def gpt2_ppl_without_numpy(sentence, model_path='gpt2'):
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    with torch.no_grad():
        # Passing labels=input_ids makes the model return the average
        # cross-entropy (negative log-likelihood) over the sequence
        outputs = model(input_ids, labels=input_ids)
    # Perplexity is the exponentiated cross-entropy
    ppl = torch.exp(outputs.loss).item()
    return ppl
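Usage mirrors the BERT example above, scoring the same sentence:

gpt2_ppl_without_numpy("I am a student")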