【动手学习pytorch笔记】37.5 BERT微调

BERT微调

import json
import multiprocessing
import os
import torch
from torch import nn
from d2l import torch as d2l

bert.base和bert.small地址

d2l.DATA_HUB['bert.base'] = (d2l.DATA_URL + 'bert.base.torch.zip',
                             '225d66f04cae318b841a13d32af3acc165f253ac')
d2l.DATA_HUB['bert.small'] = (d2l.DATA_URL + 'bert.small.torch.zip',
                              'c72329e68a732bef0452e4b96a1c341c8910f81f')

加载预先训练好的BERT参数

def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, devices):
    data_dir = d2l.download_extract(pretrained_model)
    # 定义空词表以加载预定义词表
    vocab = d2l.Vocab()
    vocab.idx_to_token = json.load(open(os.path.join(data_dir,'vocab.json')))
    vocab.token_to_idx = {token: idx for idx, token in enumerate(vocab.idx_to_token)}
    bert = d2l.BERTModel(len(vocab), num_hiddens, norm_shape=[256],
                         ffn_num_input=256, ffn_num_hiddens=ffn_num_hiddens,
                         num_heads=4, num_layers=2, dropout=0.2,
                         max_len=max_len, key_size=256, query_size=256,
                         value_size=256, hid_in_features=256,
                         mlm_in_features=256, nsp_in_features=256)
    # 加载预训练BERT参数
    bert.load_state_dict(torch.load(os.path.join(data_dir,'pretrained.params')))
    return bert, vocab

加载和微调经过预训练BERT的小版本(“bert.small”)

devices = d2l.try_all_gpus()
bert, vocab = load_pretrained_model(
    'bert.small', num_hiddens=256, ffn_num_hiddens=512, num_heads=4,
    num_layers=2, dropout=0.1, max_len=512, devices=devices)

微调BERT的数据集

class SNLIBERTDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_len, vocab=None):
        all_premise_hypothesis_tokens = [[
            p_tokens, h_tokens] for p_tokens, h_tokens in zip(
            *[d2l.tokenize([s.lower() for s in sentences])
              for sentences in dataset[:2]])]

        self.labels = torch.tensor(dataset[2])
        self.vocab = vocab
        self.max_len = max_len
        (self.all_token_ids, self.all_segments,
         self.valid_lens) = self._preprocess(all_premise_hypothesis_tokens)
        print('read ' + str(len(self.all_token_ids)) + ' examples')

    def _preprocess(self, all_premise_hypothesis_tokens):
        #pool = multiprocessing.Pool(4)  # 使用4个进程
        out = map(self._mp_worker, all_premise_hypothesis_tokens)
        out = list(out)
        all_token_ids = [
            token_ids for token_ids, segments, valid_len in out]
        all_segments = [segments for token_ids, segments, valid_len in out]
        valid_lens = [valid_len for token_ids, segments, valid_len in out]
        return (torch.tensor(all_token_ids, dtype=torch.long),
                torch.tensor(all_segments, dtype=torch.long),
                torch.tensor(valid_lens))

    def _mp_worker(self, premise_hypothesis_tokens):
        p_tokens, h_tokens = premise_hypothesis_tokens
        self._truncate_pair_of_tokens(p_tokens, h_tokens)
        tokens, segments = d2l.get_tokens_and_segments(p_tokens, h_tokens)
        token_ids = self.vocab[tokens] + [self.vocab['<pad>']] \
                             * (self.max_len - len(tokens))
        segments = segments + [0] * (self.max_len - len(segments))
        valid_len = len(tokens)
        return token_ids, segments, valid_len

    def _truncate_pair_of_tokens(self, p_tokens, h_tokens):
        # 为BERT输入中的'<CLS>'、'<SEP>'和'<SEP>'词元保留位置
        while len(p_tokens) + len(h_tokens) > self.max_len - 3:
            if len(p_tokens) > len(h_tokens):
                p_tokens.pop()
            else:
                h_tokens.pop()

    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx]), self.labels[idx]

    def __len__(self):
        return len(self.all_token_ids)
# 如果出现显存不足错误,请减少“batch_size”。在原始的BERT模型中,max_len=512
batch_size, max_len = 128, 128
data_dir = "D:\environment\data\data\snli_1.0"
train_set = SNLIBERTDataset(d2l.read_snli(data_dir, True), max_len, vocab)
test_set = SNLIBERTDataset(d2l.read_snli(data_dir, False), max_len, vocab)
train_iter = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size)
read 549367 examples
read 9824 examples

微调BERT

只需要构造一个线性输出层

class BERTClassifier(nn.Module):
    def __init__(self, bert):
        super(BERTClassifier, self).__init__()
        self.encoder = bert.encoder
        self.hidden = bert.hidden
        self.output = nn.Linear(256, 3)

    def forward(self, inputs):
        tokens_X, segments_X, valid_lens_x = inputs
        encoded_X = self.encoder(tokens_X, segments_X, valid_lens_x)
        return self.output(self.hidden(encoded_X[:, 0, :]))

训练

net = BERTClassifier(bert)
lr, num_epochs = 1e-4, 5
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss(reduction='none')
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
    devices)
loss 0.491, train acc 0.805, test acc 0.786
768.3 examples/sec on [device(type='cuda', index=0)]

在这里插入图片描述

训练的很慢很慢…

这一节的采坑:

在构建微调bert的数据集时,李沐老师原代码开了几个线程,

		pool = multiprocessing.Pool(4)  # 使用4个进程
        out = pool.map(self._mp_worker, all_premise_hypothesis_tokens)

在windows上会报错,改成如下

		#pool = multiprocessing.Pool(4)  # 使用4个进程
        out = map(self._mp_worker, all_premise_hypothesis_tokens)
        out = list(out)

不用线程池并把out转换成数组对象。

  • 1
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
具体使用方法可以看我的博客:https://blog.csdn.net/weixin_40015791/article/details/90410083 下面也会简单介绍一下:在bert开源代码中的run_classifier.py中找到 processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, "intentdetection":IntentDetectionProcessor, "emotion":EmotionProcessor, #新加上这一行 } 然后在该文件中增加一个class: class EmotionProcessor(DataProcessor): """Processor for the MRPC data set (GLUE version).""" def get_train_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "fine_tuning_train_data.tsv")), "train") #此处的名字和文件夹中的训练集的名字要保持一致 def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "fine_tuning_val_data.tsv")), "dev") def get_test_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "fine_tuning_test_data.tsv")), "test") def get_labels(self): """See base class.""" return ["0", "1","2","3","4","5","6"] #七分类则从0到6 def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": label = "0" text_a = tokenization.convert_to_unicode(line[0]) else: label = tokenization.convert_to_unicode(line[0]) text_a = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples 最后直接调用即可,运行的命令如下: python run_classifier.py \ --task_name=emotion \ --do_train=true \ --do_eval=true \ --data_dir=data \ #把数据解压到同一级的文件夹中,此处是该文件夹名字data --vocab_file=chinese_L-12_H-768_A-12/vocab.txt \ #中文数据要微调的原始bert模型 --bert_config_file=chinese_L-12_H-768_A-12/bert_config.json \ --init_checkpoint=chinese_L-12_H-768_A-12/bert_model.ckpt \ --max_seq_length=128 \ --train_batch_size=32 \ --learning_rate=2e-5 \ --num_train_epochs=3.0 \ --output_dir=output #生成文件所在的文件夹 大概9个小时,最后文件夹中会有三个文件 后缀分别为index/meta/00000-of-00001,分别将这个改成bert_model.ckpt.index/bert_model.ckpt.meta/bert_model.ckpt.data-00000-of-00001,再在同一个文件夹中放入chinese_L-12_H-768_A-12中的vocab.txt和bert_config.json 即最后该文件夹中有5个文件。然后像调用chinese_L-12_H-768_A-12一样将文件夹名改成自己的文件夹名即可。 bert-serving-start -model_dir output -num_worfer=3 即可调用微调后的语言通用模型。
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值