[NLP] [PyTorch] Continuing Pretraining BERT on a Domain Corpus

This post shows how to use the transformers library to continue pretraining BERT on a specific corpus. First, the parameters are set and the IMDb dataset is loaded. Next, the pretrained BERT model and tokenizer are loaded. Then a Dataset and DataLoader are defined, along with a custom Trainer class that handles the training loop and data processing. Finally, the model is trained on multiple GPUs and the resulting weights are saved.
import pandas as pd
import os
import random
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AdamW, BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, List
from torch.nn.utils.rnn import pad_sequence

1. Transformers

transformers is an open-source package of pretrained language models: https://github.com/huggingface/transformers. It provides pretrained models for tasks such as text classification, information extraction, question answering, summarization, and translation. This post uses BERT from transformers to continue pretraining on a specific corpus with the MLM (masked language modeling) objective.
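As a quick illustration of the MLM objective that continued pretraining reuses (not part of the original walkthrough), the off-the-shelf checkpoint can already fill in masked tokens through the fill-mask pipeline; a minimal sketch assuming bert-base-uncased:

from transformers import pipeline

# Sketch: let the pretrained MLM head predict the masked token.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
print(fill_mask("The movie was absolutely [MASK]."))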

2. Setting the Parameters

base = "./Data/imdb"
pretrain_model = "bert-base-uncased"
max_length = 512
epochs = 3
seed = 900

random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
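Since the Trainer defined below wraps the model in nn.DataParallel when several GPUs are available, seeding every visible device is a reasonable optional addition (not in the original setup):

torch.cuda.manual_seed_all(seed)  # optional: seed all visible GPUs for multi-GPU runs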

3. Reading and Processing the Data

train = pd.read_csv(os.path.join(base,"labeledTrainData.tsv"), header=0,delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(base,"testData.tsv"),header=0,delimiter="\t", quoting=3)
train['review'] = train['review'].apply(lambda r:r.strip("\""))
test['review'] = test['review'].apply(lambda r:r.strip("\""))
examples = list(train['review'])+list(test['review'])
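The unlabeled corpus is simply the concatenation of the train and test reviews. With batch size 8 the training log below runs to roughly 6,250 iterations per epoch, which matches the 50,000 reviews of this Kaggle IMDb split; a quick illustrative check:

print(len(examples))  # expected: 50000 (25k labeled train reviews + 25k test reviews)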

4. Loading the Pretrained Model and Tokenizer

tokenizer = BertTokenizer.from_pretrained(pretrain_model)
model = BertForMaskedLM.from_pretrained(pretrain_model)

5. Defining the Dataset

class LineByLineTextDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = tokenizer.batch_encode_plus(examples, add_special_tokens=True,
                                                    max_length=max_length, truncation=True)["input_ids"]
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
    
dataset = LineByLineTextDataset(examples, tokenizer, max_length)
print(" ".join(tokenizer.convert_ids_to_tokens(dataset[5])))

Output:

[CLS] i don ##t know why people think this is such a bad movie . its got a pretty good plot , some good action , and the change of location for harry does not hurt either . sure some of its offensive and gr ##at ##uit ##ous but this is not the only movie like that . eastwood is in good form as dirty harry , and i liked pat hi ##ng ##le in this movie as the small town cop . if you liked dirty harry , then you should see this one , its a lot better than the dead pool . 4 / 5 [SEP]

6. Defining the DataLoader

def collate(examples: List[torch.Tensor]):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

dataloader = DataLoader(dataset, shuffle=True, batch_size=8, collate_fn=collate)
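To confirm that the collate function pads each batch to the length of its longest sequence, one can inspect a single batch (illustrative check, not part of the original post):

batch = next(iter(dataloader))
print(batch.shape)  # e.g. torch.Size([8, 512]) when the longest review reaches max_length
print((batch == tokenizer.pad_token_id).sum().item(), "padding tokens in this batch")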

7. Defining the Trainer

class Trainer:
    def __init__(self, model, dataloader, tokenizer, mlm_probability=0.15, lr=1e-4, with_cuda=True, cuda_devices=None, log_freq=100):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.is_parallel = False
        self.dataloader = dataloader
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.log_freq = log_freq
        
        # Multi-GPU training
        if with_cuda and torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUS for BERT")
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
            self.is_parallel = True
        self.model.train()
        self.model.to(self.device)
        self.optim = AdamW(self.model.parameters(), lr=lr)  # use the learning rate passed to the constructor
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
        
    def train(self, epoch):
        self.iteration(epoch, self.dataloader)
        
    def iteration(self, epoch, dataloader, train=True):
        str_code = 'Train'
        total_loss = 0.0
        for i,batch in tqdm(enumerate(dataloader), desc="Training"):
            inputs, labels = self._mask_tokens(batch)
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            # Note: older transformers versions accept masked_lm_labels= and return a (loss, scores) tuple;
            # newer versions expect labels= and return a ModelOutput instead.
            lm_loss, output = self.model(inputs, masked_lm_labels=labels)
            loss = lm_loss.mean()
            
            if train:
                self.model.zero_grad()
                self.optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optim.step()
                
            total_loss += loss.item()
            post_fix = {
                "iter": i,
                "ave_loss": total_loss/(i+1)
            }
            if i % self.log_freq == 0:
                print(post_fix)
                
        print(f"EP{epoch}_{str_code},avg_loss={total_loss/len(dataloader)}")
        
    def _mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Masked Language Model """
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )
            
        labels = inputs.clone()
        # Fill a tensor with mlm_probability at every position
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # Build a mask over the special tokens ([CLS], [SEP], ...)
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        # Set the masking probability of special-token positions to 0
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            # Padding mask
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            # Set the masking probability of padding positions to 0
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        
        # Sample the tokens to mask
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # the loss is only computed on masked tokens
        
        # 80% of the time, replace the masked token with [MASK]
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        
        # 10% of the time, replace the masked token with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        
        # The remaining 10% are left unchanged
        return inputs, labels
    
trainer = Trainer(model, dataloader, tokenizer)

Output:

Using 2 GPUS for BERT
Total Parameters: 109514298
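Before launching a full run, the 80/10/10 masking scheme can be checked on a single batch (an illustrative sketch, not part of the original post):

sample = next(iter(dataloader))
masked_inputs, mlm_labels = trainer._mask_tokens(sample.clone())
num_selected = (mlm_labels != -100).sum().item()
print(f"{num_selected} of {sample.numel()} tokens selected for the MLM loss (~15% of non-special, non-padding tokens)")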

8. Training

for epoch in range(epochs):
    trainer.train(epoch)

Output:

{'iter': 0, 'ave_loss': 2.836127758026123}
{'iter': 100, 'ave_loss': 2.370666817863389}
{'iter': 200, 'ave_loss': 2.3373566693927517}
...
{'iter': 6200, 'ave_loss': 2.2395244782787236}
EP0_Train,avg_loss=2.239439134082794

9. Saving the Model

model.save_pretrained(".")
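It also makes sense to save the tokenizer next to the model so that both can be reloaded later for downstream fine-tuning; a minimal sketch (the sequence-classification head and num_labels=2 are assumptions for an IMDb-style sentiment task, not part of the original post):

tokenizer.save_pretrained(".")

from transformers import BertForSequenceClassification
# Reload the continued-pretrained encoder; the classification head is newly initialized.
clf_model = BertForSequenceClassification.from_pretrained(".", num_labels=2)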