本文使用了五折交叉验证和早停(early stop)方法
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import classification_report
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from transformers import logging
import torch.nn.functional as F
import os
from Attention import TopicEnhance
from early_stopping import EarlyStopping
import time
# Timestamp suffix so every run gets its own checkpoint directory, e.g. "0131120000"
localtime = time.strftime("%m%d%H%M%S", time.localtime())
# Silence transformers' non-error logging
logging.set_verbosity_error()
filename = 'twitter15/twitter15.csv'
# filename = 'Pheme_New.csv'
pretrained_model = 'bert-base-cased'
cuda_id = 0
# Per-run checkpoint directory: <this file's dir>/save/<timestamp>
save_path = os.path.join(os.path.dirname(__file__), 'save/' + localtime)
print("save_path:" + save_path)
# exist_ok=True replaces the manual os.path.exists() check (race-free)
os.makedirs(save_path, exist_ok=True)
tokenizer = BertTokenizer.from_pretrained(pretrained_model)
# twitter15 rumor-verification classes -> integer ids
labels = {'non-rumor': 0,
          'true': 1,
          'false': 2,
          'unverified': 3,
          }
'''
labels = {'rumours': 0,
          'non-rumours': 1,
          }
'''
num_labels = len(labels)
def get_device(gpu_id):
    """Return torch.device("cuda:<gpu_id>") when CUDA is available, else the CPU device."""
    if torch.cuda.is_available():
        print("device is cuda, # cuda is: ", gpu_id)
        return torch.device(f"cuda:{gpu_id}")
    print("device is cpu, not recommend")
    return torch.device("cpu")
# Resolve the compute device once at import time; train/evaluate below read
# this module-level `device`.
device = get_device(cuda_id)
class Dataset(torch.utils.data.Dataset):
    """Wraps a DataFrame whose column 1 holds texts and column 2 label names."""

    def __init__(self, df):
        # Map label names to integer ids via the module-level `labels` dict.
        self.labels = [labels[name] for name in df[2]]
        # Pre-tokenize every text once; each entry holds (1, 50) padded tensors.
        self.texts = [
            tokenizer(
                text,
                padding='max_length',
                max_length=50,
                truncation=True,
                return_tensors="pt",
            )
            for text in df[1]
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Label(s) at `idx`, as a numpy array
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Tokenized encoding(s) at `idx`
        return self.texts[idx]

    def __getitem__(self, idx):
        return self.get_batch_texts(idx), self.get_batch_labels(idx)
class BertOrigin(nn.Module):
    """BERT encoder + dropout + a single linear head over the [CLS] token."""

    def __init__(self, dropout=0.1):
        super(BertOrigin, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)

    def forward(self, input_id, mask):
        # return_dict=False -> tuple (sequence_output, pooled_output); keep the first
        sequence_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)[0]
        # Classify from the [CLS] position (index 0) after dropout
        cls_token = self.dropout(sequence_output)[:, 0]
        return self.linear(cls_token)
def train(model, train_data, val_data, learning_rate, epochs, early_stop, model_name):
    """Train `model` on `train_data`, validating on `val_data` after each epoch.

    The best model (by validation accuracy) is checkpointed by EarlyStopping
    under save_path/<model_name>/; training stops after `early_stop` epochs
    without improvement.

    Args:
        model: nn.Module to train (moved to `device` when CUDA is available).
        train_data / val_data: DataFrames accepted by Dataset
            (column 1 = text, column 2 = label name).
        learning_rate: Adam learning rate.
        epochs: maximum number of epochs.
        early_stop: early-stopping patience, in epochs.
        model_name: checkpoint subdirectory name.
    """
    early_stopping = EarlyStopping(save_path=save_path, patience=early_stop)
    # Wrap the DataFrames; only the training loader shuffles.
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=16, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=16)
    use_cuda = torch.cuda.is_available()
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    if use_cuda:
        model = model.to(device)
        criterion = criterion.to(device)
    for epoch_num in range(epochs):
        # ---- training pass ----
        model.train()  # re-enable dropout (validation below switches to eval mode)
        total_acc_train = 0
        total_loss_train = 0
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.type(torch.LongTensor)
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            # the tokenizer returned (1, seq_len) tensors; drop that extra dim
            input_id = train_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        # ---- validation pass ----
        # Fix: the original validated in train mode with autograd on, so
        # dropout skewed the metrics and gradients were computed needlessly.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.type(torch.LongTensor)
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()
        print(
            f'''Epochs: {epoch_num + 1}
            | Train Loss: {total_loss_train / len(train_data): .3f}
            | Train Accuracy: {total_acc_train / len(train_data): .3f}
            | Val Loss: {total_loss_val / len(val_data): .3f}
            | Val Accuracy: {total_acc_val / len(val_data): .3f}''')
        # EarlyStopping monitors validation accuracy (higher is better),
        # checkpoints on improvement, and flags early_stop after `patience`
        # epochs without one.
        early_stopping(total_acc_val, model, model_name)
        if early_stopping.early_stop:
            print("Early stopping")
            break  # leave the epoch loop and end training
def evaluate(model, test_data, model_name):
    """Evaluate the best checkpoint of `model_name` on `test_data`.

    Loads save_path/<model_name>/best_network.pth into `model`, runs
    inference, then prints and returns the precision/recall/f1 table from
    sklearn's classification_report.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=16)
    use_cuda = torch.cuda.is_available()
    dict_path = os.path.join(save_path, model_name, 'best_network.pth')
    # map_location lets a checkpoint saved on GPU load on the current device
    model.load_state_dict(torch.load(dict_path, map_location=device))
    if use_cuda:
        model = model.to(device)
    model.eval()  # fix: disable dropout for inference (original stayed in train mode)
    y_true = []
    y_pred = []
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            y_true.extend(test_label.cpu().numpy())
            y_pred.extend(output.argmax(dim=1).cpu().numpy())
    report = classification_report(y_true, y_pred, zero_division="warn", digits=4, output_dict=True)
    # Keep the precision/recall/f1 rows (drop support), one row per class/average.
    df = pd.DataFrame(report)[:3].T
    print(df)
    return df
# ---- hyperparameters ----
EPOCHS = 10
LR = 5e-5
EARLY_STOP = 5
N_SPLITS = 5  # cross-validation folds; also used for the final averaging
df = pd.read_csv(filename, sep='\t', header=None)
# NOTE(review): only BertOrigin is defined in this file; the other class names
# are presumably defined elsewhere -- confirm before running the full list.
model_list = ['BertOrigin', 'BertMean', 'BertTMean', 'BertTWMean']
test_list = ['BertOrigin']
input_list = model_list
print("Models:", model_list)
# One running report per model, summed across folds then averaged at the end.
report_list = [pd.DataFrame() for _ in range(len(input_list))]
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
def train_and_evaluate(model, df_train, df_val, df_test, LR, EPOCHS, early_stop, model_name):
    """Train `model` with early stopping, then evaluate its best checkpoint."""
    train(model, df_train, df_val, LR, EPOCHS, early_stop, model_name)
    return evaluate(model, df_test, model_name)
# Stratified k-fold CV; inside each fold, 25% of the training split is held
# out for validation / early stopping.
for i, (train_index, test_index) in enumerate(skf.split(df[1], df[2])):
    df_train, df_test = df.iloc[train_index], df.iloc[test_index]
    df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=42)
    print("-----" + str(i + 1) + "folder" + "-----" + str(len(df_train)) + "-----" + str(len(df_val)) + "-----" + str(
        len(df_test)))
    for j, model_name in enumerate(input_list):
        print("-----" + str(i + 1) + "folder" + "-----" + model_name + "-----")
        # Look the model class up by name instead of eval(): same effect,
        # without the arbitrary-code-execution footgun.
        model = globals()[model_name]()
        report = train_and_evaluate(model, df_train, df_val, df_test, LR, EPOCHS, EARLY_STOP, model_name)
        report_list[j] = report_list[j].add(report, fill_value=0)  # accumulate per-fold results
print("-----the result of 5folder-----")
for i, model_name in enumerate(input_list):
    report = report_list[i] / N_SPLITS  # average over folds
    print("-----" + model_name + "----- Acc:" + str(report['precision']['accuracy']))
    print(report)
早停实现:
import numpy as np
import torch
import os
class EarlyStopping:
    """Stops training when the monitored validation metric stops improving.

    Note: despite the ``val_loss`` parameter name (kept for compatibility),
    ``__call__`` treats a *higher* score as better -- the caller in this
    project passes validation accuracy.
    """

    def __init__(self, save_path, patience=7, verbose=False, delta=0):
        """
        Args:
            save_path : directory under which checkpoints are saved
            patience (int): how many calls without improvement to tolerate
                before setting ``early_stop``.
                Default: 7
            verbose (bool): if True, print a message on each improvement.
                Default: False
            delta (float): minimum change in the monitored quantity to qualify
                as an improvement.
                Default: 0
        """
        self.save_path = save_path
        self.patience = patience
        self.verbose = verbose
        self.counter = 0            # calls since the last improvement
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf  # fix: np.Inf was removed in NumPy 2.0
        self.delta = delta

    def __call__(self, val_loss, model, model_name):
        score = val_loss
        if self.best_score is None:
            # First call: take the score as the baseline and checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_name)
        elif score < self.best_score + self.delta:
            # No improvement: spend one unit of the patience budget.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement: checkpoint and reset the counter.
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_name)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, model_name):
        '''Saves the current (best-so-far) model under save_path/<model_name>/.'''
        if self.verbose:
            # Fix: the original message claimed the loss "decreased", but this
            # method runs when the monitored metric improves (goes up).
            print(f'Validation metric improved ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        # exist_ok=True replaces the manual exists() check (race-free)
        os.makedirs(os.path.join(self.save_path, model_name), exist_ok=True)
        path = os.path.join(self.save_path, model_name, 'best_network.pth')
        torch.save(model.state_dict(), path)  # parameters of the best model so far
        self.val_loss_min = val_loss