All of the code below is available in the official GitHub repository.
1. First, install the required libraries
!pip install openprompt
!pip install transformers
!pip install datasets
2. Load the dataset. We use CB (CommitmentBank), a sentence-pair task from SuperGLUE.
# load dataset
from datasets import load_dataset
raw_dataset = load_dataset('super_glue', 'cb', cache_dir="../datasets/.cache/huggingface_datasets")
raw_dataset['train'][0]
Output:
{'premise': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
'hypothesis': 'the language was peeled down',
'idx': 0,
'label': 0}
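Before converting the data, it helps to check what the integer labels mean. A small sketch using the plain Hugging Face `datasets` API (nothing OpenPrompt-specific; for CB the labels should come out as entailment/contradiction/neutral):
# Inspect the label names and the split sizes of super_glue/cb
print(raw_dataset['train'].features['label'].names)
for split in raw_dataset:
    print(split, len(raw_dataset[split]))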
3. Convert the raw dataset into the format OpenPrompt can use
from openprompt.data_utils import InputExample
dataset = {}
for split in ['train', 'validation', 'test']:  # process all three splits (when they exist)
    dataset[split] = []
    for data in raw_dataset[split]:
        # Given these four arguments, InputExample builds the data structure OpenPrompt needs;
        # text_a and text_b are fixed field names.
        input_example = InputExample(text_a=data['premise'], text_b=data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)
print(dataset['train'][0])
Output:
{
"guid": 0,
"label": 0,
"meta": {},
"text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
"text_b": "the language was peeled down",
"tgt_text": null
}
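The `meta` and `tgt_text` fields are empty here. `meta` can carry extra per-example information that a template may later reference; a hypothetical sketch (the 'source' key is made up purely for illustration):
# Hypothetical: store extra information in `meta`; templates can refer to it
# (see the OpenPrompt template documentation for the meta syntax).
example_with_meta = InputExample(text_a=data['premise'], text_b=data['hypothesis'],
                                 label=int(data['label']), guid=data['idx'],
                                 meta={'source': 'super_glue/cb'})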
4. Load the pretrained language model; here we use T5
# You can load the PLM-related objects provided by OpenPrompt simply by calling:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")
# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
from openprompt.prompts import ManualTemplate
template_text = '{"placeholder":"text_a"} Question: {"placeholder":"text_b"}? Is it correct? {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)
# To better understand how the template wraps an example, let's visualize one instance.
# loss_ids=0 means no loss is computed on that piece; shortenable_ids=0 means it may not be shortened (truncated).
wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
wrapped_example
Output:
[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
'loss_ids': 0,
'shortenable_ids': 1},
{'text': ' Question:', 'loss_ids': 0, 'shortenable_ids': 0},
{'text': ' the language was peeled down',
'loss_ids': 0,
'shortenable_ids': 1},
{'text': '? Is it correct?', 'loss_ids': 0, 'shortenable_ids': 0},
{'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0},
{'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}],
{'guid': 0, 'label': 0}]
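The template mini-language mixes hard text with {"placeholder"} and {"mask"} slots, so the prompt wording is easy to change. A hypothetical variant for the same task (the wording is my own, not from the official tutorial):
# Hypothetical alternative wording; only the hard text changes.
alt_template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} Does this mean that {"placeholder":"text_b"}? {"mask"}.',
)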
5. Tokenize
wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
# or, equivalently:
from openprompt.plms import T5TokenizerWrapper
wrapped_t5tokenizer = T5TokenizerWrapper(max_seq_length=30, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
# You can see what a tokenized example looks like by
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))
Output:
{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'decoder_input_ids': [0, 32099, 0], 'loss_ids': [0, 1, 0]}
['▁It', '▁was', '▁', 'a', '▁complex', '▁language', '.', '▁Not', '▁written', '▁down', '▁Question', ':', '▁the', '▁language', '▁was', '▁pe', 'ele', 'd', '▁down', '▁', '?', '▁I', 's', '▁it', '▁correct', '?', '<extra_id_0>', '▁', '.', '</s>']
['<pad>', '<extra_id_0>', '<pad>']
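With max_seq_length=30 the wrapped example does not fit, so the shortenable pieces (the two placeholders) get truncated while the hard template text survives intact. To see how the two documented truncation strategies differ, a minimal sketch (assuming "head" and "tail" are the accepted values, as in the OpenPrompt examples):
# Compare truncation strategies on the same wrapped example
for method in ["head", "tail"]:
    wt = T5TokenizerWrapper(max_seq_length=30, decoder_max_length=3,
                            tokenizer=tokenizer, truncate_method=method)
    ids = wt.tokenize_one_example(wrapped_example, teacher_forcing=False)['input_ids']
    print(method, tokenizer.convert_ids_to_tokens(ids))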
6. Wrap and tokenize all of the training data. Note the order: each example must first be wrapped by the template, and only then passed to the wrapper tokenizer. A loop along the lines of the official tutorial is sketched below.
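# Wrap every example with the template, then tokenize it
# (sketch following the official tutorial; `model_inputs` is my own variable name).
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(
            mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)
print(model_inputs['train'][0])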
Output:
{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'decoder_input_ids': [0, 32099, 0],
 'loss_ids': [0, 1, 0]}
7. Read the data with a DataLoader
# We provide a `PromptDataLoader` class to do all of the above for you and wrap everything into a `torch.DataLoader`-style iterator.
from openprompt import PromptDataLoader
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
tokenizer_wrapper_class=WrapperClass, max_seq_length=30, decoder_max_length=3,
batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
truncate_method="head")
next(iter(train_dataloader))
Output:
tokenizing: 250it [00:00, 747.35it/s]
{"input_ids": [[27, 1869, 131, 147, 11860, 10, 31014, 141, 3, 9, 1338, 28, 112, 1365, 6979, 11, 12638, 530, 12744, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [7005, 9, 11338, 24, 132, 398, 36, 1995, 13, 712, 1076, 11860, 10, 7005, 9, 47, 352, 550, 2238, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [27, 54, 31, 17, 5293, 12, 129, 3005, 11918, 323, 11860, 10, 8, 192, 3567, 141, 321, 141, 631, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1], [11860, 10, 255, 141, 801, 1363, 5, 11, 8667, 5, 3038, 2796, 32, 26, 21, 167, 13, 70, 1342, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1]], "inputs_embeds": null, "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], "token_type_ids": null, "label": [0, 0, 2, 0], "decoder_input_ids": [[0, 32099, 0], [0, 32099, 0], [0, 32099, 0], [0, 32099, 0]], "decoder_inputs_embeds": null, "soft_token_ids": null, "past_key_values": null, "loss_ids": [[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0]], "guid": [113, 31, 86, 115], "tgt_text": null, "encoded_tgt_text": null, "input_ids_len": null}
8. Define the verbalizer
# Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:
from openprompt.prompts import ManualVerbalizer
import torch
# for example the verbalizer contains multiple label words in each class
myverbalizer = ManualVerbalizer(tokenizer, num_classes=3,
label_words=[["yes"], ["no"], ["maybe"]])
print(myverbalizer.label_words_ids)
# Suppose the PLM outputs logits of shape [batch_size, vocab_size]
logits = torch.randn(2, len(tokenizer))  # create a pseudo output from the PLM
print(myverbalizer.process_logits(logits))  # see what the verbalizer does
Output:
Parameter containing:
tensor([[[4273]],
[[ 150]],
[[2087]]])
tensor([[-2.7433, -0.4004, -1.3257],
[-0.1777, -1.9210, -4.1158]])
The first tensor contains the token ids of the label words; the second gives, for each example in the batch, the score over the three label words.
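Each class may also map to several label words; the verbalizer then aggregates their scores into a single class score (averaging, as far as I understand the default). A hypothetical variant (the extra words are my own choice, not from the tutorial):
# Hypothetical: multiple label words per class
myverbalizer_multi = ManualVerbalizer(tokenizer, num_classes=3,
    label_words=[["yes", "true"], ["no", "false"], ["maybe", "perhaps"]])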
9. Define the classification model
# pipeline for the classification task
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)  # the arguments are the model, the template, and the verbalizer
if use_cuda:
    prompt_model = prompt_model.cuda()
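With freeze_plm=False the whole T5 backbone is trainable. A quick sanity check (plain PyTorch, not an OpenPrompt API):
# Count trainable parameters to confirm the backbone is not frozen
n_trainable = sum(p.numel() for p in prompt_model.parameters() if p.requires_grad)
print(f"trainable parameters: {n_trainable:,}")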
10. Train
# Now the training is standard
from transformers import AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# It's always good practice to apply no weight decay to bias and LayerNorm parameters.
# any() returns True if at least one element of the iterable is truthy, and False if all are falsy.
# Here it is used to exclude bias and LayerNorm.weight from weight decay.
optimizer_grouped_parameters = [
{'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step % 100 == 1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
Output (I didn't want to wait for all ten epochs, so I interrupted the run):
Epoch 0, average loss: 1.1108011603355408
Epoch 1, average loss: 0.3556751012802124
Epoch 2, average loss: 0.16584844887256622
Epoch 3, average loss: 0.01751189772039652
Epoch 4, average loss: 0.002058575744740665
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-18-70654314ef30> in <module>
23 tot_loss += loss.item()
24 optimizer.step()
---> 25 optimizer.zero_grad()
26 if step %100 ==1:
27 print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
/usr/local/lib/python3.9/dist-packages/torch/optim/optimizer.py in zero_grad(self, set_to_none)
277 p.grad.requires_grad_(False)
278 if (not foreach or p.grad.is_sparse):
--> 279 p.grad.zero_()
280 else:
281 per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(p.grad)
KeyboardInterrupt:
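Note that `get_linear_schedule_with_warmup` is imported above but never actually used. If you want warmup followed by linear decay, a minimal sketch (the warmup step count is an arbitrary choice of mine, and I assume `len(train_dataloader)` gives the number of batches):
# Hypothetical: linear schedule with warmup on top of AdamW
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer,
    num_warmup_steps=50, num_training_steps=num_training_steps)
# ...then call scheduler.step() right after optimizer.step() inside the loop.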
11. Predict
# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
truncate_method="head")
allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())  # .tolist() converts the tensor straight to a Python list
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)
Output:
tokenizing: 56it [00:00, 302.53it/s]
0.6964285714285714
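To read the predictions as class names instead of integers, the label metadata from `datasets` can be reused (a sketch; plain Hugging Face functionality, not OpenPrompt):
# Map integer predictions back to CB's class names
label_names = raw_dataset['validation'].features['label'].names
print([label_names[p] for p in allpreds[:5]])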
I wrote this post just to record my learning process; for the full details see https://github.com/thunlp/OpenPrompt.