BERT fine-tuning script (personal notes)

#%% Imports
from transformers import BertTokenizer,BertModel,BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.utils import shuffle
from plt_score import plt_roc_pr

#%% Load data
import os, sys
# print('current path:', os.getcwd())
os.chdir(sys.path[0])
# print('current path:', os.getcwd())
# df = pd.read_excel('../../data/label_data_language_en.xlsx')
df = pd.read_excel('../../data/label_data_language_en_aug.xlsx')  # read the augmented dataset
print(df['gf_review_fix'].value_counts())
df['content'] = df['content'].str.lower().fillna('test')  # lowercase everything; fill missing content
print(df.isnull().sum())
df = shuffle(df, random_state=1115)  # shuffle twice, with fixed seeds for reproducibility
df = shuffle(df, random_state=930)
#%% Text preprocessing

start_token = time.time()
x = list(df['content'])
y = list(df['gf_review_fix'])
# Stratified 80/10/10 split into train / validation / test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1634, stratify=y)
x_vali, x_test, y_vali, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=1111, stratify=y_test)
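
# Optional sanity check that stratification preserved the class ratio across
# splits (value_counts(normalize=True) prints class proportions):
print(pd.Series(y_train).value_counts(normalize=True))
print(pd.Series(y_vali).value_counts(normalize=True))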

tokenizer = BertTokenizer.from_pretrained('../../model-pytorch/bert-base-uncased')
# print(tokenizer)
train_encoding = tokenizer(x_train, add_special_tokens=True,
                           padding=True, truncation=True, max_length=300, return_tensors="pt")
vali_encoding = tokenizer(x_vali, add_special_tokens=True,
                          padding=True, truncation=True, max_length=300, return_tensors="pt")
test_encoding = tokenizer(x_test, add_special_tokens=True,
                          padding=True, truncation=True, max_length=300, return_tensors="pt")



# print(type(train_encoding))
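
# Optional sanity check: decode the first training example back to text to
# confirm truncation and padding behave as expected ([CLS]/[SEP]/[PAD]
# tokens should be visible).
print(tokenizer.decode(train_encoding['input_ids'][0]))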

#%% Dataset class
# print(x)
class NewsDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # __getitem__ lets the DataLoader batch the dataset as an iterator,
    # mapping each idx to the corresponding example
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = NewsDataset(train_encoding, y_train)
vali_dataset = NewsDataset(vali_encoding, y_vali)
test_dataset = NewsDataset(test_encoding, y_test)

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Keep the validation/test loaders unshuffled so the predictions collected in
# validation() stay aligned with y_vali / y_test when computing metrics
vali_loader = DataLoader(vali_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


end_token = time.time()
print('Tokenization and DataLoader setup time: {} seconds'.format(end_token - start_token))
# Peek at one batch:
# batch = next(iter(train_loader))
# print(batch)
# print(batch['input_ids'].shape)


# %% Build the model

torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        model_class, tokenizer_class, pretrained_weights = (BertModel, BertTokenizer,
                                                            "../../model-pytorch/bert-base-uncased/")
        # self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        # BERT, the inputs and the dense head all have to live on the GPU
        # To resume training from run ** of a previous session, point
        # pretrained_weights at that checkpoint:
        # pretrained_weights = '../model_save/uncase_bert_stratify_19'
        self.bert = torch.nn.DataParallel(model_class.from_pretrained(pretrained_weights).cuda())
        # Final prediction head: BERT's default hidden size is 768, with two
        # output units for binary classification. It emits raw logits, since
        # nn.CrossEntropyLoss applies softmax internally; stacking an extra
        # nn.Softmax here would flatten the gradients.
        self.predictor = nn.Linear(768, 2).cuda()

    def forward(self, input_ids, attention_mask):
        torch.cuda.empty_cache()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:, 0, :]  # hidden state of the [CLS] token
        logits = self.predictor(bert_cls_hidden_state)
        return logits


#%% Instantiate the model; define the loss function, optimizer and epochs


# Initialization
torch.cuda.empty_cache()
bert_classifier_model = BertClassificationModel()
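
# Smoke test (a minimal sketch, assuming a CUDA device is available, since
# the model hard-codes .cuda()): run one batch through the model and confirm
# the output shape is (batch_size, 2).
with torch.no_grad():
    _sample = next(iter(train_loader))
    _out = bert_classifier_model(_sample['input_ids'], _sample['attention_mask'])
print('smoke-test output shape:', _out.shape)  # expect torch.Size([32, 2])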

# Hyperparameters -----------------
# Epochs and learning rate
epochs = 10
lr = 1e-4

# Optimizer, scheduler and loss function
# Switched Adam -> AdamW and dropped the frozen-layer filter
# (filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5).
# Note: transformers' AdamW is deprecated in newer releases;
# torch.optim.AdamW is a drop-in replacement.
optimizer = AdamW(bert_classifier_model.parameters(), lr=lr)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # default value in run_glue.py
                                            num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss().cuda()
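
# With num_warmup_steps=0 the scheduler is a straight linear decay from lr
# down to 0 over total_steps. A sketch of the closed form, for inspection
# only (the actual decay is applied by scheduler.step() during training):
def planned_lr(step):
    # linear decay: lr at step 0, 0 at the final step
    return lr * max(0.0, (total_steps - step) / total_steps)
print('lr at start / midpoint / end:',
      planned_lr(0), planned_lr(total_steps // 2), planned_lr(total_steps))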


# Bookkeeping -----------------
# Training-set loss log
batch_loss = pd.DataFrame(columns=['epoch', 'batch', 'loss'])

# Validation-set scores
epoch_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# Test-set scores
epoch_test_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# Best-model tracking
max_f1 = 0
max_epoch = 0

#%% Training function
def train():
    bert_classifier_model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    global batch_loss
    for batch in train_loader:
        # Forward pass
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()

        loss.backward()
        nn.utils.clip_grad_norm_(bert_classifier_model.parameters(), 1.0)  # gradient clipping to prevent exploding gradients

        # Parameter update
        optimizer.step()
        scheduler.step()

        # Running mean loss, further scaled by batch size
        # (note: CrossEntropyLoss already averages over the batch)
        loss_each = total_train_loss / (iter_num + 1) / batch_size

        df1 = pd.DataFrame([[epoch, iter_num, loss_each]], columns=['epoch', 'batch', 'loss'])
        batch_loss = pd.concat([batch_loss, df1], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss_each, iter_num / total_iter * 100))
    batch_loss.to_csv('./result/batch_loss.csv')
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))


#%% Evaluation functions

def validation(test_dataloader):
    bert_classifier_model.eval()
    pre_list = []
    prob_list = []

    with torch.no_grad():
        for batch in test_dataloader:
            # Forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)

            # The model returns raw logits; apply softmax to get the
            # positive-class probability for the ROC/PR curves
            prob = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            prob_list.extend(prob.tolist())
            _, predicted = torch.max(outputs, 1)
            predicted = predicted.cpu().numpy()
            torch.cuda.empty_cache()
            pre_list.extend(predicted.tolist())
    return pre_list, prob_list

def metric_show(pre_list, targets, prob_list, epoch, figlist):
    prescore = precision_score(targets, pre_list)
    accscore = accuracy_score(targets, pre_list)
    recascore = recall_score(targets, pre_list)
    f1score = f1_score(targets, pre_list)
    print("precision:{}".format(prescore))
    print("accuracy:{}".format(accscore))
    print("recall:{}".format(recascore))
    print("f1:{}".format(f1score))

    save_path = './result/bert'
    plt_roc_pr(targets, prob_list, name=str(epoch), save_path=save_path, figlist=figlist)
    return prescore, accscore, recascore, f1score
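
# metric_show scores argmax predictions, i.e. an implicit 0.5 probability
# threshold on the positive class. The returned probabilities can be
# re-thresholded after a validation pass (the 0.3 here is illustrative):
# custom_pred = [int(p >= 0.3) for p in prob_list]
# print(f1_score(y_vali, custom_pred))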

#%% Training loop
start = time.time()

for epoch in range(epochs):
    print("------------Epoch: %d ----------------" % epoch)
    start_epoch = time.time()
    train()

    # Use the validation set to pick the best-performing epoch
    pre_list, prob_list = validation(vali_loader)
    prescore, accscore, recascore, f1score = metric_show(pre_list, y_vali, prob_list, epoch, figlist=[0, 1])

    epoch_scores.loc[epoch] = [epoch, prescore, accscore, recascore, f1score]
    epoch_scores.to_csv('./result/epoch_scores.csv')

    # Track the best F1 and its epoch; save the model whenever it improves
    if epoch > 5 and max_f1 < f1score:
        max_f1 = f1score
        max_epoch = epoch
        print("Best F1: {} at epoch {}".format(max_f1, max_epoch))
        save_directory = '../model_save/uncase_bert_stratify_best'
        bert_classifier_model.bert.module.save_pretrained(save_directory)
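        # save_pretrained only persists the BERT encoder; also save the
        # predictor head here so the full classifier can be restored from the
        # same epoch (a sketch; the predictor.pt filename is illustrative)
        torch.save(bert_classifier_model.predictor.state_dict(),
                   save_directory + '/predictor.pt')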

    # Record test-set metrics
    pre_list, prob_list = validation(test_loader)
    t_prescore, t_accscore, t_recascore, t_f1score = metric_show(pre_list, y_test, prob_list, epoch='t' + str(epoch), figlist=[2, 3])

    epoch_test_scores.loc[epoch] = [epoch, t_prescore, t_accscore, t_recascore, t_f1score]
    epoch_test_scores.to_csv('./result/epoch_test_scores.csv')

    end_epoch = time.time()
    print('Epoch running time: {} seconds'.format(end_epoch - start_epoch))

end = time.time()
print('Total training time: {} seconds'.format(end - start))
