import os

for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/lib/kaggle/gcp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
/kaggle/working/__notebook_source__.ipynb
'''
Binary text classification
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import torch
1. Define the dataset
class Dataset(Dataset):  # subclasses torch.utils.data.Dataset imported above
    def __init__(self, split):
        self.dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']
        label = self.dataset[item]['label']
        return text, label
dataset = Dataset('train')
len(dataset), dataset[0]
'''
(9600,
('选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。
包的早餐是西式的,还算丰富。 服务吗,一般',
1))
'''
2. Load the tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
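As a quick sanity check (a hedged sketch, not part of the original notebook), the tokenizer can be called on a single sentence; with the standard Hugging Face API it returns input_ids, token_type_ids, and attention_mask, and decode maps ids back to text:

# Hedged sketch: encode one sentence from the dataset sample above and decode it back.
encoded = tokenizer('酒店装修一般,但还算整洁。', max_length=20,
                    padding='max_length', truncation=True)
print(encoded['input_ids'])                    # token ids, starting with [CLS], ending with [SEP]/[PAD]
print(tokenizer.decode(encoded['input_ids']))  # round-trip back to text, special tokens included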
3. Define the collate function
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # Tokenize the whole batch, padding/truncating every sample to 500 tokens
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       max_length=500,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels
4. DataLoader
loader = DataLoader(dataset=dataset,
                    batch_size=16,
                    collate_fn=collate_fn,
                    shuffle=True,
                    drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
'''
600
(torch.Size([16, 500]),
torch.Size([16, 500]),
torch.Size([16, 500]),
tensor([1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1]))
'''
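Decoding one batch item (a hedged check, not in the original output) confirms that padding and truncation behave as configured:

# Hedged sketch: decode the first item of the batch to inspect [CLS]/[SEP]/[PAD] placement.
print(tokenizer.decode(input_ids[0]))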
5. Load the Chinese BERT model
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder; only the downstream head will be trained
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape
'''
torch.Size([16, 500, 768])
'''
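The downstream classifier below uses only the hidden state of the first token ([CLS]) as the sentence representation; a hedged sketch of that slice:

# Hedged sketch: the [CLS] vector per sample, i.e. a [batch_size, 768] sentence embedding.
cls_embedding = out.last_hidden_state[:, 0]
print(cls_embedding.shape)  # expected: torch.Size([16, 768]) for this batch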
6. Define the downstream task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # The frozen BERT encoder acts as a fixed feature extractor
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # Classify from the [CLS] token representation
        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)
'''
tensor([[0.4310, 0.5690],
[0.4490, 0.5510],
[0.3135, 0.6865],
[0.4604, 0.5396],
[0.5144, 0.4856],
[0.5671, 0.4329],
[0.4388, 0.5612],
[0.6065, 0.3935],
[0.5771, 0.4229],
[0.6195, 0.3805],
[0.4489, 0.5511],
[0.4380, 0.5620],
[0.4747, 0.5253],
[0.3910, 0.6090],
[0.4087, 0.5913],
[0.5216, 0.4784]], grad_fn=<SoftmaxBackward0>)
'''
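A hedged side note: torch.nn.CrossEntropyLoss applies log-softmax internally, so passing it the softmax probabilities above still trains (as the results below show) but compresses the gradients; a more conventional variant (a sketch only, LogitModel is a name introduced here) would return raw logits:

# Hedged alternative sketch: return logits and let CrossEntropyLoss handle the softmax.
class LogitModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        return self.fc(out.last_hidden_state[:, 0])  # raw logits, no softmax here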
7. Training
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 10:
        break
'''
0 0.7164372801780701 0.4375
5 0.6556262969970703 0.8125
10 0.6778689622879028 0.5
'''
8. Testing
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # Evaluate only the first 5 batches to keep the demo fast
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if i == 5:
            break
        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
3
4
0.625
'''
9. Saving and loading the model
Save the model's parameters and buffers (state_dict)
path = '/kaggle/working/state_dict_model.pt'
torch.save(model.state_dict(), path)

n1_model = Model()
n1_model.load_state_dict(torch.load(path))
n1_model.eval()
'''
Model(
(fc): Linear(in_features=768, out_features=2, bias=True)
)
'''
Save the entire model
path = '/kaggle/working/entire_model.pt'
torch.save(model, path)

n2_model = torch.load(path)
n2_model.eval()
'''
Model(
(fc): Linear(in_features=768, out_features=2, bias=True)
)
'''
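Worth noting (a hedged addition): saving the entire model pickles the Model class by reference, so torch.load only works where that class definition is importable; the state_dict route above is more portable, and map_location can load weights onto a different device, for example:

# Hedged sketch: reload the saved state_dict onto CPU (standard PyTorch pattern).
cpu_model = Model()
cpu_model.load_state_dict(torch.load('/kaggle/working/state_dict_model.pt',
                                     map_location=torch.device('cpu')))
cpu_model.eval()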
10. Checkpoint saving and loading
epoch = 5
loss = 0.4
path = '/kaggle/working/5_0.4_checkpoint.pt'

torch.save({
    'epoch': epoch,
    'loss': loss,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, path)

n3_model = Model()
n3_optimizer = torch.optim.AdamW(n3_model.parameters(), lr=5e-4)

checkpoint = torch.load(path)
epoch = checkpoint['epoch']
loss = checkpoint['loss']
n3_model.load_state_dict(checkpoint['model_state_dict'])
n3_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

n3_model.eval()
# or, to resume training:
n3_model.train()
import os

for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/lib/kaggle/gcp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
/kaggle/working/state_dict_model.pt
/kaggle/working/__notebook_source__.ipynb
/kaggle/working/5_0.4_checkpoint.pt
/kaggle/working/entire_model.pt
'''
Text fill-in-the-blank (masked token prediction)
import torch
from datasets import load_dataset


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

        # Keep only texts longer than 30 characters (position 15 is masked below)
        def f(data):
            return len(data['text']) > 30

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']
        return text


dataset = Dataset('train')
len(dataset), dataset[0]
'''
(9192,
'选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。
包的早餐是西式的,还算丰富。 服务吗,一般')
'''
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
def collate_fn(data):
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=data,
                                       max_length=30,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    # Use the token at position 15 as the label, then replace it with [MASK] in the input
    labels = input_ids[:, 15].reshape(-1).clone()
    input_ids[:, 15] = tokenizer.get_vocab()[tokenizer.mask_token]

    return input_ids, attention_mask, token_type_ids, labels
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print(len(loader))
print(tokenizer.decode(input_ids[0]))
print(tokenizer.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape
'''
574
[CLS] 看 完 之 后 兴 奋 了 很 久... 因 为 [MASK] 也 处 于 那 种 阶 段... 值 得 学 [SEP]
我
(torch.Size([16, 30]),
torch.Size([16, 30]),
torch.Size([16, 30]),
torch.Size([16]))
'''
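A small hedged check (not in the original) that the masking in collate_fn did what we expect:

# Hedged sketch: position 15 of every sample should now hold the [MASK] token id.
print(tokenizer.mask_token_id)                              # id of [MASK] in the vocabulary
print((input_ids[:, 15] == tokenizer.mask_token_id).all())  # expected: tensor(True)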
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape

# Project the hidden state at the masked position onto the whole vocabulary
linear = torch.nn.Linear(768, len(tokenizer.get_vocab()), bias=False)
res = linear(out.last_hidden_state[:, 15])
res
'''
tensor([[ 0.1829, -0.5311, -0.1989, ..., -0.1905, -0.4505, -0.1490],
[ 0.3157, -0.8063, 0.0054, ..., -0.1232, -0.1233, -0.0622],
[-0.4099, -0.4986, 0.1289, ..., -0.6015, -0.0355, -0.3704],
...,
[-0.5781, -0.4431, -0.1011, ..., -0.1154, -0.4494, -0.1650],
[ 0.2260, -0.3658, -0.2587, ..., -0.3548, -0.2574, 0.4675],
[-0.2136, -0.1950, 0.0240, ..., -0.3630, -0.3018, -0.5322]],
grad_fn=<MmBackward0>)
'''
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # Decoder maps the 768-d hidden state to a score for every vocabulary token
        self.decoder = torch.nn.Linear(768, len(tokenizer.get_vocab()), bias=False)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # Predict the token at the masked position (index 15)
        out = self.decoder(out.last_hidden_state[:, 15])
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
'''
torch.Size([16, 21128])
'''
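To see an actual prediction rather than just shapes, a hedged sketch that decodes the highest-scoring token for the first sample (the decoder is still untrained at this point, so the guess will usually be wrong):

# Hedged sketch: pick the most likely token for the masked position of sample 0.
with torch.no_grad():
    logits = model(input_ids=input_ids,
                   attention_mask=attention_mask,
                   token_type_ids=token_type_ids)
predicted_id = logits[0].argmax().item()
print(tokenizer.decode([predicted_id]), '| ground truth:', tokenizer.decode([labels[0].item()]))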
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(1):
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            print(epoch, i, loss.item(), accuracy)
'''
0 0 1.883865237236023 0.625
0 50 2.328688144683838 0.6875
0 100 1.7976388931274414 0.8125
0 150 1.6133203506469727 0.6875
0 200 0.9813346862792969 0.9375
0 250 2.2197675704956055 0.625
0 300 2.4435458183288574 0.6875
0 350 2.027521848678589 0.6875
0 400 1.2056934833526611 0.8125
0 450 2.092010021209717 0.75
0 500 0.865594744682312 0.875
0 550 1.6895039081573486 0.625
'''
def test():
    model.eval()
    correct = 0
    total = 0

    loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                         batch_size=32,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)

    # Evaluate only the first 15 batches
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        if i == 15:
            break
        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
...
13
14
0.6229166666666667
'''
Sentence relation (sentence-pair classification)
import torch
import random
from datasets import load_dataset


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

        # Keep only texts longer than 40 characters so two 20-character sentences can be cut out
        def f(data):
            return len(data['text']) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']

        # Split one review into two consecutive 20-character "sentences" (label 0 = consecutive)
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0

        # With probability 0.5, replace the second sentence with one from a random review (label 1)
        if random.randint(0, 1) == 0:
            idx = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[idx]['text'][20:40]
            label = 1

        return sentence1, sentence2, label


dataset = Dataset('train')
print(len(dataset))
print(dataset[0])
'''
8001
('选择珠江花园的原因就是方便,有电动扶梯直'
, '台,居然说我没有预定,折腾了半天,郁闷,'
, 1)
'''
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
print(dataset[0][:2])
print(dataset[0][2])
'''
('选择珠江花园的原因就是方便,有电动扶梯直'
, '接到达海边,周围餐馆、食廊、商场、超市、')
0
'''
def collate_fn(dataset):
    sents = [i[:2] for i in dataset]   # (sentence1, sentence2) pairs
    labels = [i[2] for i in dataset]

    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       max_length=45,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True,
                                       add_special_tokens=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print('len(loader)', len(loader))
print(tokenizer.decode(input_ids[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape
'''
len(loader) 1000
[CLS] 这 套 书 也 是 因 为 高 居 榜 首 之 左 右 , 才 决 定 买 的 [SEP] 。 买 来 之 后 , 心 情 很 激 动 , 认 为 这 下 可 以 和 女 [SEP] [PAD] [PAD]
(torch.Size([8, 45]),
torch.Size([8, 45]),
torch.Size([8, 45]),
torch.Size([8]))
'''
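For sentence pairs, token_type_ids marks which segment each position belongs to (0 for sentence A including [CLS] and the first [SEP], 1 for sentence B); a hedged sketch to inspect it:

# Hedged sketch: segment ids for the first sample of the batch.
print(token_type_ids[0])
# Expected pattern: a run of 0s for "[CLS] sentence1 [SEP]", then 1s for "sentence2 [SEP]";
# padded positions keep segment id 0 but have attention_mask 0.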
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape
'''
torch.Size([8, 45, 768])
'''
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = pretrained(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)

        # Classify the pair relation from the [CLS] representation
        out = self.linear(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)
'''
tensor([[0.6049, 0.3951],
[0.6584, 0.3416],
[0.6334, 0.3666],
[0.6860, 0.3140],
[0.8079, 0.1921],
[0.6332, 0.3668],
[0.6906, 0.3094],
[0.7661, 0.2339]], grad_fn=<SoftmaxBackward0>)
'''
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 25:
        break
'''
0 0.7084164619445801 0.5
5 0.6241328120231628 0.75
10 0.5478470921516418 0.75
15 0.5879313945770264 0.625
20 0.4713858366012573 1.0
25 0.5577196478843689 0.75
'''
def test():
    model.eval()
    correct = 0
    total = 0

    loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                         batch_size=32,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)

    # Evaluate only the first 5 batches
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        if i == 5:
            break
        print(i)

        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        out = out.argmax(dim=1)

        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
3
4
0.81875
'''