All of the code below is available in the official GitHub repository.
1. First, install the required libraries
!pip install openprompt
!pip install transformers
!pip install datasets
2. Load the dataset. We use CB (CommitmentBank), a sentence-pair task from SuperGLUE.
# load dataset
from datasets import load_dataset
raw_dataset = load_dataset('super_glue', 'cb', cache_dir="../datasets/.cache/huggingface_datasets")
raw_dataset['train'][0]
Output:
{'premise': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
'hypothesis': 'the language was peeled down',
'idx': 0,
'label': 0}
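Before converting the data, it helps to check what the integer labels mean. A small sketch using the plain Hugging Face `datasets` API (nothing OpenPrompt-specific; for CB the labels should come out as entailment/contradiction/neutral):
# Inspect the label names and the split sizes of super_glue/cb
print(raw_dataset['train'].features['label'].names)
for split in raw_dataset:
    print(split, len(raw_dataset[split]))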
3. Convert the raw dataset into the format OpenPrompt can use
from openprompt.data_utils import InputExample
dataset = {}
for split in ['train', 'validation', 'test']:  # process all three splits (when they exist)
    dataset[split] = []
    for data in raw_dataset[split]:
        # Given these four arguments, InputExample builds the data structure OpenPrompt needs;
        # text_a and text_b are fixed field names.
        input_example = InputExample(text_a=data['premise'], text_b=data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)
print(dataset['train'][0])
Output:
{
"guid": 0,
"label": 0,
"meta": {},
"text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
"text_b": "the language was peeled down",
"tgt_text": null
}
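The `meta` and `tgt_text` fields are empty here. `meta` can carry extra per-example information that a template may later reference; a hypothetical sketch (the 'source' key is made up purely for illustration):
# Hypothetical: store extra information in `meta`; templates can refer to it
# (see the OpenPrompt template documentation for the meta syntax).
example_with_meta = InputExample(text_a=data['premise'], text_b=data['hypothesis'],
                                 label=int(data['label']), guid=data['idx'],
                                 meta={'source': 'super_glue/cb'})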
4. Load the pretrained language model; here we use T5
# You can load the PLM-related objects provided by OpenPrompt simply by calling:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")
# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
from openprompt.prompts import ManualTemplate
template_text = '{"placeholder":"text_a"} Question: {"placeholder":"text_b"}? Is it correct? {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)
# To better understand how the template wraps an example, let's visualize one instance.
# loss_ids=0 means no loss is computed on that piece; shortenable_ids=0 means it may not be shortened (truncated).
wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
wrapped_example
Output:
[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
'loss_ids': 0,
'shortenable_ids': 1},
{'text': ' Question:', 'loss_ids': 0, 'shortenable_ids': 0},
{'text': ' the language was peeled down',
'loss_ids': 0,
'shortenable_ids': 1},
{'text': '? Is it correct?', 'loss_ids': 0, 'shortenable_ids': 0},
{'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0},
{'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}],
{'guid': 0, 'label': 0}]
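The template mini-language mixes hard text with {"placeholder"} and {"mask"} slots, so the prompt wording is easy to change. A hypothetical variant for the same task (the wording is my own, not from the official tutorial):
# Hypothetical alternative wording; only the hard text changes.
alt_template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} Does this mean that {"placeholder":"text_b"}? {"mask"}.',
)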
5. Tokenize
wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
# or, equivalently:
from openprompt.plms import T5TokenizerWrapper
wrapped_t5tokenizer = T5TokenizerWrapper(max_seq_length=30, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
# You can see what a tokenized example looks like by
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))
Output:
{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'decoder_input_ids': [0, 32099, 0], 'loss_ids': [0, 1, 0]}
['▁It', '▁was', '▁', 'a', '▁complex', '▁language', '.', '▁Not', '▁written', '▁down', '▁Question', ':', '▁the', '▁language', '▁was', '▁pe', 'ele', 'd', '▁down', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>']
['<pad>', '<extra_id_0>', '<pad>']
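With max_seq_length=30 the wrapped example does not fit, so the shortenable pieces (the two placeholders) get truncated while the hard template text survives intact. To see how the two documented truncation strategies differ, a minimal sketch (assuming "head" and "tail" are the accepted values, as in the OpenPrompt examples):
# Compare truncation strategies on the same wrapped example
for method in ["head", "tail"]:
    wt = T5TokenizerWrapper(max_seq_length=30, decoder_max_length=3,
                            tokenizer=tokenizer, truncate_method=method)
    ids = wt.tokenize_one_example(wrapped_example, teacher_forcing=False)['input_ids']
    print(method, tokenizer.convert_ids_to_tokens(ids))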
6. Wrap and tokenize all of the training data. Note the order: each example must first be wrapped by the template, and only then passed to the wrapper tokenizer. A loop along the lines of the official tutorial is sketched below.
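# Wrap every example with the template, then tokenize it
# (sketch following the official tutorial; `model_inputs` is my own variable name).
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(
            mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)
print(model_inputs['train'][0])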
Output:
{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'decoder_input_ids': [0, 32099, 0],
 'loss_ids': [0, 1, 0]}
7. Read the data with a DataLoader
# We provide a `PromptDataLoader` class to do all of the above for you and wrap everything into a `torch.DataLoader`-style iterator.
from openprompt import PromptDataLoader
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
tokenizer_wrapper_class=WrapperClass, max_seq_length=30, decoder_max_length=3,
batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
truncate_method="head")
next(iter(train_dataloader))
Output:
tokenizing: 250it [00:00, 747.35it/s]
{"input_ids": [[27, 1869, 131, 147, 11860, 10, 31014, 141, 3, 9, 1338, 28, 112, 1365, 6979, 11, 12638, 530, 12744, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [7005, 9, 11338, 24, 132, 398, 36, 1995, 13, 712, 1076, 11860, 10, 7005, 9, 47, 352, 550, 2238, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [27, 54, 31, 17, 5293, 12, 129, 3005, 11918, 323, 11860, 10, 8, 192, 3567, 141, 321, 141, 631, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [11860, 10, 255, 141, 801, 1363, 5, 11, 8667, 5, 3038, 2796, 32, 26, 21, 167, 13, 70, 1342, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1]], "inputs_embeds": null, "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], "token_type_ids": null, "label": [0, 0, 2, 0], "decoder_input_ids": [[0, 32099, 0], [0, 32099, 0], [0, 32099, 0], [0, 32099, 0]], "decoder_inputs_embeds": null, "soft_token_ids": null, "past_key_values": null, "loss_ids": [[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0]], "guid": [113, 31, 86, 115], "tgt_text": null, "encoded_tgt_text": null, "input_ids_len": null}
8. Define the verbalizer
# Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:
from openprompt.prompts import ManualVerbalizer
import torch
# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=3,
label_words=[["yes"], ["no"], ["maybe"]])
print(myverbalizer.label_words_ids)
# Suppose the PLM outputs logits of shape [batch_size, vocab_size]
logits = torch.randn(2, len(tokenizer))  # create a pseudo output from the PLM
print(myverbalizer.process_logits(logits))  # see what the verbalizer does
Output:
Parameter containing:
tensor([[[4273]],
[[ 150]],
[[2087]]])
tensor([[-2.7433, -0.4004, -1.3257],
[-0.1777, -1.9210, -4.1158]])
The first tensor contains the token ids of the label words; the second gives, for each example in the batch, the score over the three label words.
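Each class may also map to several label words; the verbalizer then aggregates their scores into a single class score (averaging, as far as I understand the default). A hypothetical variant (the extra words are my own choice, not from the tutorial):
# Hypothetical: multiple label words per class
myverbalizer_multi = ManualVerbalizer(tokenizer, num_classes=3,
    label_words=[["yes", "true"], ["no", "false"], ["maybe", "perhaps"]])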
9. Define the classification model
# pipeline for the classification task
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)  # the arguments are the model, the template, and the verbalizer
if use_cuda:
    prompt_model = prompt_model.cuda()
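With freeze_plm=False the whole T5 backbone is trainable. A quick sanity check (plain PyTorch, not an OpenPrompt API):
# Count trainable parameters to confirm the backbone is not frozen
n_trainable = sum(p.numel() for p in prompt_model.parameters() if p.requires_grad)
print(f"trainable parameters: {n_trainable:,}")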
10. Train
# Now the training is standard
from transformers import AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# It's always good practice to apply no weight decay to bias and LayerNorm parameters.
# any() returns True if at least one element of the iterable is truthy, and False if all are falsy.
# Here it is used to exclude bias and LayerNorm.weight from weight decay.
optimizer_grouped_parameters = [
{'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step % 100 == 1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
Output (I didn't want to wait for all ten epochs, so I interrupted the run):
Epoch 0, average loss: 1.1108011603355408
Epoch 1, average loss: 0.3556751012802124
Epoch 2, average loss: 0.16584844887256622
Epoch 3, average loss: 0.01751189772039652
Epoch 4, average loss: 0.002058575744740665
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-18-70654314ef30> in <module>
23 tot_loss += loss.item()
24 optimizer.step()
---> 25 optimizer.zero_grad()
26 if step %100 ==1:
27 print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
/usr/local/lib/python3.9/dist-packages/torch/optim/optimizer.py in zero_grad(self, set_to_none)
277 p.grad.requires_grad_(False)
278 if (not foreach or p.grad.is_sparse):
--> 279 p.grad.zero_()
280 else:
281 per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(p.grad)
KeyboardInterrupt:
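Note that `get_linear_schedule_with_warmup` is imported above but never actually used. If you want warmup followed by linear decay, a minimal sketch (the warmup step count is an arbitrary choice of mine, and I assume `len(train_dataloader)` gives the number of batches):
# Hypothetical: linear schedule with warmup on top of AdamW
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer,
    num_warmup_steps=50, num_training_steps=num_training_steps)
# ...then call scheduler.step() right after optimizer.step() inside the loop.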
11. Predict
# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
truncate_method="head")
allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())  # .tolist() converts the tensor straight to a Python list
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)
Output:
tokenizing: 56it [00:00, 302.53it/s]
0.6964285714285714
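To read the predictions as class names instead of integers, the label metadata from `datasets` can be reused (a sketch; plain Hugging Face functionality, not OpenPrompt):
# Map integer predictions back to CB's class names
label_names = raw_dataset['validation'].features['label'].names
print([label_names[p] for p in allpreds[:5]])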
I wrote this post just to record my learning process; for the full details see https://github.com/thunlp/OpenPrompt.