import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as tfs
import warnings
warnings.filterwarnings('ignore')
train_df = pd.read_csv('./data/train.tsv', delimiter='\t', header=None)
train_set = train_df[:3000] # take the first 3000 rows as our dataset
print(train_set.shape)
train_set[1].value_counts() # inspect the label distribution in the dataset
model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
# add_special_tokens=True adds the [CLS] and [SEP] tokens at the start and end of each sentence
train_tokenized = train_set[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
# pad all sentences to the same length
train_max_len = 0
for i in train_tokenized.values:
    if len(i) > train_max_len:
        train_max_len = len(i)
train_padded = np.array([i + [0] * (train_max_len - len(i)) for i in train_tokenized.values])
print(train_padded.shape)
# build the attention mask: 0 marks padding positions that BERT should ignore
train_attention_mask = np.where(train_padded != 0, 1, 0)
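# Note: the manual padding and mask above can also be produced by the tokenizer
# itself. A minimal equivalent sketch, using the same batch_encode_plus API that
# the fine-tuning model below relies on (outputs should match train_padded and
# train_attention_mask):
encoded = tokenizer.batch_encode_plus(train_set[0].tolist(), add_special_tokens=True,
                                      max_length=train_max_len, pad_to_max_length=True)
# np.array(encoded['input_ids']) -> train_padded; np.array(encoded['attention_mask']) -> train_attention_mask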
# extract features with the pretrained model
train_input_ids = torch.tensor(train_padded).long()
train_attention_mask = torch.tensor(train_attention_mask).long()
with torch.no_grad():
    train_last_hidden_states = model(train_input_ids, attention_mask=train_attention_mask)
train_last_hidden_states[0].size() # (number of samples, sequence length, hidden size)
train_features = train_last_hidden_states[0][:,0,:].numpy() # take the output vector at position 0, i.e. [CLS]; it should summarize the whole sentence better than any other single position
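# A common alternative to [CLS] pooling (not part of the original recipe): average
# all token vectors, using the attention mask to keep padding out of the mean.
mask = train_attention_mask.unsqueeze(-1).float()                           # (N, L, 1)
mean_features = ((train_last_hidden_states[0] * mask).sum(1) / mask.sum(1)).numpy()
# mean_features could be passed to the classifier below in place of train_features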
train_labels = train_set[1]
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels)
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
lr_clf.score(test_features, test_labels)
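# GridSearchCV and cross_val_score were imported above but never used; a quick
# sketch of applying them to the extracted features to validate the score and
# tune the regularization strength C (the grid values are illustrative):
cv_scores = cross_val_score(LogisticRegression(), train_features, train_labels, cv=5)
print("5-fold CV accuracy: %.3f (+/- %.3f)" % (cv_scores.mean(), cv_scores.std()))
grid = GridSearchCV(LogisticRegression(), {'C': [0.01, 0.1, 1, 10]}, cv=5)
grid.fit(train_features, train_labels)
print("best C:", grid.best_params_['C'], "CV score: %.3f" % grid.best_score_)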
# Part 2 - fine-tuning BERT
import torch
from torch import nn
from torch import optim
import transformers as tfs
import math
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.bert = model_class.from_pretrained(pretrained_weights)
        self.dense = nn.Linear(768, 2) # BERT's default hidden size is 768; 2 output units for binary classification

    def forward(self, batch_sentences):
        batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
                                                           max_length=66, pad_to_max_length=True) # tokenize, add special tokens, pad
        input_ids = torch.tensor(batch_tokenized['input_ids'])
        attention_mask = torch.tensor(batch_tokenized['attention_mask'])
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:,0,:] # hidden state at the [CLS] position
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output
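# Quick shape check (hypothetical sentences; uncomment to try the class on its
# own, it loads the pretrained weights, so it is not run here by default):
# clf = BertClassificationModel()
# print(clf(["a fine film", "a dull film"]).shape) # expected: torch.Size([2, 2])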
sentences = train_set[0].values
targets = train_set[1].values
train_inputs, test_inputs, train_targets, test_targets = train_test_split(sentences, targets)
batch_size = 64
batch_count = len(train_inputs) // batch_size
batch_train_inputs, batch_train_targets = [], []
for i in range(batch_count):
    batch_train_inputs.append(train_inputs[i*batch_size : (i+1)*batch_size])
    batch_train_targets.append(train_targets[i*batch_size : (i+1)*batch_size])
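# Note: the slicing above drops the last len(train_inputs) % batch_size samples.
# torch.utils.data.DataLoader is a common alternative that keeps the remainder
# and reshuffles each epoch; a minimal sketch (not used by the loop below):
from torch.utils.data import DataLoader
train_loader = DataLoader(list(zip(train_inputs, train_targets)),
                          batch_size=batch_size, shuffle=True)
# for batch_sentences, batch_labels in train_loader: ...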
#train the model
epochs = 3
lr = 0.01
print_every_batch = 5
bert_classifier_model = BertClassificationModel()
optimizer = optim.SGD(bert_classifier_model.parameters(), lr=lr, momentum=0.9)
criterion = nn.CrossEntropyLoss()
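# Note: SGD with lr=0.01 can be unstable for BERT fine-tuning; the fine-tuning
# tutorial referenced below uses AdamW with a much smaller learning rate. An
# alternative worth trying:
# optimizer = optim.AdamW(bert_classifier_model.parameters(), lr=2e-5)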
for epoch in range(epochs):
    print_avg_loss = 0
    for i in range(batch_count):
        inputs = batch_train_inputs[i]
        labels = torch.tensor(batch_train_targets[i])
        optimizer.zero_grad()
        outputs = bert_classifier_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print_avg_loss += loss.item()
        if i % print_every_batch == (print_every_batch-1):
            print("Batch: %d, Loss: %.4f" % ((i+1), print_avg_loss/print_every_batch))
            print_avg_loss = 0
# eval the trained model
total = len(test_inputs)
hit = 0
with torch.no_grad():
    for i in range(total):
        outputs = bert_classifier_model([test_inputs[i]])
        _, predicted = torch.max(outputs, 1)
        if predicted == test_targets[i]:
            hit += 1
print("Accuracy: %.2f%%" % (hit / total * 100))
References:
https://www.pythonf.cn/read/88960
Using BERT for the first time
https://mccormickml.com/2019/07/22/BERT-fine-tuning/