Task overview:
Work through a simple BERT intent-classification project to understand the typical workflow of applying BERT to an NLP task.
This section covers:
- Model evaluation metrics
- Model training and evaluation
- The optimizer
- (a look at tqdm, Python's progress-bar module)
Import third-party libraries:
Input:
%cd ../
import os
import logging
import numpy as np
import torch
import random
from tqdm import tqdm, trange  # tqdm is Python's progress-bar library
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup, DistilBertConfig, AlbertConfig
from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score
# The classes and functions in the next 4 imports were covered in the first note
from bert_finetune_cls.utils import MODEL_CLASSES, get_intent_labels
from bert_finetune_cls.model import ClsBERT
from bert_finetune_cls.utils import load_tokenizer, get_intent_labels
from bert_finetune_cls.data_loader import load_and_cache_examples
logger = logging.getLogger(__name__)
Output:
D:\notebook_workspace\BERT_cls
I. Model evaluation metrics
Code:
# Compute evaluation metrics
def compute_metrics(intent_preds, intent_labels):
    """
    Compute metrics
    """
    assert len(intent_preds) == len(intent_labels)
    results = {}
    intent_result = get_intent_acc(intent_preds, intent_labels)
    results.update(intent_result)
    return results

def get_intent_acc(preds, labels):
    acc = (preds == labels).mean()
    return {
        "intent_acc": acc
    }
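For intuition, a quick toy check of these two functions on hand-made numpy arrays (the label ids below are invented purely for illustration) might look like this:

import numpy as np

preds = np.array([0, 1, 2, 1])
labels = np.array([0, 1, 1, 1])
print(compute_metrics(preds, labels))  # 3 of 4 predictions match -> {'intent_acc': 0.75}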
II. Model training and evaluation methods
Code:
class Trainer(object):

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        # Load the label map: the list of intent label names, indexed by label id
        self.intent_label_lst = get_intent_labels(args)

        # Load the model config and the model itself
        self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
        self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.task)  # finetuning_task=args.task is custom information added to the config
        self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                      config=self.config,
                                                      args=args,
                                                      intent_label_lst=self.intent_label_lst)

        # Move the model to the GPU if one is available, otherwise use the CPU
        self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        self.model.to(self.device)
    def train(self):
        # Load the training data
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)

        # Compute the total number of optimizer update steps, used for the learning-rate schedule
        # (this is the number of parameter updates, not the number of batches iterated)
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Print the parameter names to see what the model contains
        for n, p in self.model.named_parameters():
            print(n)

        # Prepare the optimizer and learning-rate scheduler (linear warmup followed by linear decay)
        no_decay = ['bias', 'LayerNorm.weight']  # bias and LayerNorm parameters are excluded from weight decay
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)  # eps: term added to the denominator for numerical stability
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)  # number of batches over which gradients are accumulated, see: https://www.cnblogs.com/sddai/p/14598018.html
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)  # how often to compute dev performance
        logger.info("  Save steps = %d", self.args.save_steps)  # how often to save a model checkpoint

        global_step = 0
        tr_loss = 0.0

        # The usual steps of a neural-network training loop:
        self.model.zero_grad()  # 1. clear gradients before training

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # move the batch to the device: GPU or CPU

                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          }
                if self.args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)  # 2. forward pass
                loss = outputs[0]               # 3. compute the loss

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()  # 4. backward pass, compute gradients

                tr_loss += loss.item()
                # Parameters are updated and gradients cleared once every gradient_accumulation_steps batches,
                # e.g. with gradient_accumulation_steps = 3 the update happens once every 3 batches
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)  # clip gradients to max_grad_norm

                    optimizer.step()        # 5. update parameters
                    scheduler.step()        # update the learning-rate schedule
                    self.model.zero_grad()  # 6. clear gradients
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        self.evaluate("dev")

                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break

        return global_step, tr_loss / global_step
    def evaluate(self, mode):
        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Evaluate!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        intent_preds = None
        out_intent_label_ids = None

        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          }
                if self.args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)
                tmp_eval_loss, intent_logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # Intent prediction: accumulate logits and gold label ids across batches
            if intent_preds is None:
                intent_preds = intent_logits.detach().cpu().numpy()
                out_intent_label_ids = inputs['intent_label_ids'].detach().cpu().numpy()
            else:
                intent_preds = np.append(intent_preds, intent_logits.detach().cpu().numpy(), axis=0)
                out_intent_label_ids = np.append(
                    out_intent_label_ids, inputs['intent_label_ids'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        results = {
            "loss": eval_loss
        }

        # Intent result: argmax over the logits gives the predicted label ids
        intent_preds = np.argmax(intent_preds, axis=1)

        total_result = compute_metrics(intent_preds, out_intent_label_ids)
        results.update(total_result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))

        return results
    def save_model(self):
        # Save model checkpoint (overwrites any previous checkpoint)
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(self.args.model_dir)

        # Save the training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args.model_dir, 'training_args.bin'))
        logger.info("Saving model checkpoint to %s", self.args.model_dir)

    def load_model(self):
        # Check whether the model exists
        if not os.path.exists(self.args.model_dir):
            raise Exception("Model doesn't exist! Train first!")

        try:
            self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                          args=self.args,
                                                          intent_label_lst=self.intent_label_lst)
            self.model.to(self.device)
            logger.info("***** Model Loaded *****")
        except Exception:
            raise Exception("Some model files might be missing...")
III. Model training and evaluation
1. Instantiate the model
Input:
MODEL_CLASSES = {
    'bert': (BertConfig, ClsBERT, BertTokenizer),
}

MODEL_PATH_MAP = {
    'bert': 'bert_finetune_cls/resources/uncased_L-2_H-128_A-2',
}

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if not args.no_cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

# First build the arguments
class Args():
    task = None
    data_dir = None
    intent_label_file = None

args = Args()
args.seed = 1991
args.no_cuda = True
args.task = "atis"
args.data_dir = "./bert_finetune_cls/data"
args.intent_label_file = "intent_label.txt"
args.max_seq_len = 50
args.model_type = "bert"
args.model_dir = "bert_finetune_cls/experiments/outputs/clsbert_0"
args.model_name_or_path = MODEL_PATH_MAP[args.model_type]
args.train_batch_size = 8
args.eval_batch_size = 16
args.dropout_rate = 0.1
args.max_steps = 1000
args.num_train_epochs = 1
args.gradient_accumulation_steps = 1
args.weight_decay = 1e-5
args.learning_rate = 1e-5
args.adam_epsilon = 1e-8
args.max_grad_norm = 1.0
args.warmup_steps = 100
args.logging_steps = 100
args.save_steps = 200

# Set the random seed
set_seed(args)

# Load the tokenizer
tokenizer = load_tokenizer(args)

# Load the datasets
train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

# Build the trainer
trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
Output:
tensor([[ 101, 1045, 2215, ..., 0, 0, 0],
[ 101, 2461, 4440, ..., 0, 0, 0],
[ 101, 2265, 2033, ..., 0, 0, 0],
...,
[ 101, 2425, 2033, ..., 0, 0, 0],
[ 101, 1045, 1005, ..., 0, 0, 0],
[ 101, 2003, 2045, ..., 0, 0, 0]])
<class 'torch.Tensor'>
tensor([[ 101, 1045, 2215, ..., 0, 0, 0],
[ 101, 2265, 2033, ..., 0, 0, 0],
[ 101, 1045, 2052, ..., 0, 0, 0],
...,
[ 101, 2054, 7599, ..., 0, 0, 0],
[ 101, 2265, 2033, ..., 0, 0, 0],
[ 101, 2054, 7599, ..., 0, 0, 0]])
<class 'torch.Tensor'>
Some weights of the model checkpoint at bert_finetune_cls/resources/uncased_L-2_H-128_A-2 were not used when initializing ClsBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing ClsBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ClsBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ClsBERT were not initialized from the model checkpoint at bert_finetune_cls/resources/uncased_L-2_H-128_A-2 and are newly initialized: ['intent_classifier.linear.weight', 'intent_classifier.linear.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
tensor([[ 101, 1045, 2052, ..., 0, 0, 0],
[ 101, 2006, 2258, ..., 0, 0, 0],
[ 101, 2006, 2258, ..., 0, 0, 0],
...,
[ 101, 1045, 1005, ..., 0, 0, 0],
[ 101, 1045, 2052, ..., 0, 0, 0],
[ 101, 2424, 2033, ..., 0, 0, 0]])
<class 'torch.Tensor'>
2. Model training
Input:
# train the model
trainer.train()
Output:
Epoch: 0%| | 0/2 [00:00<?, ?it/s]
Iteration: 0%| | 0/560 [00:00<?, ?it/s]
Iteration: 0%|▏ | 1/560 [00:00<01:12, 7.68it/s]
bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
······
bert.encoder.layer.1.output.LayerNorm.bias
bert.pooler.dense.weight
bert.pooler.dense.bias
intent_classifier.linear.weight
intent_classifier.linear.bias
Iteration: 1%|▍ | 3/560 [00:00<00:59, 9.35it/s]
Iteration: 1%|▋ | 5/560 [00:00<00:50, 10.99it/s]
Iteration: 1%|▉ | 7/560 [00:00<00:43, 12.57it/s]
Iteration: 2%|█▏ | 9/560 [00:00<00:39, 14.11it/s]
Iteration: 2%|█▍ | 11/560 [00:00<00:35, 15.46it/s]
Iteration: 2%|█▋ | 13/560 [00:00<00:33, 16.48it/s]
Iteration: 3%|█▉ | 15/560 [00:00<00:31, 17.15it/s]
Iteration: 3%|██▏ | 17/560 [00:00<00:30, 17.59it/s]
Iteration: 3%|██▍ | 19/560 [00:01<00:29, 18.11it/s]
Iteration: 4%|██▋ | 21/560 [00:01<00:29, 18.39it/s]
Iteration: 4%|██▉ | 23/560 [00:01<00:29, 18.48it/s]
Iteration: 4%|███▏ | 25/560 [00:01<00:29, 18.21it/s]
Iteration: 5%|███▍ | 27/560 [00:01<00:29, 17.79it/s]
Iteration: 5%|███▋ | 29/560 [00:01<00:29, 17.84it/s]
······
Evaluating: 0%| | 0/32 [00:00<?, ?it/s]
Evaluating: 25%|█████████████████▊ | 8/32 [00:00<00:00, 74.72it/s]
Evaluating: 53%|█████████████████████████████████████▏ | 17/32 [00:00<00:00, 76.61it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 83.88it/s]
Iteration: 18%|████████████▎ | 100/560 [00:05<00:50, 9.14it/s]
Iteration: 18%|████████████▌ | 102/560 [00:05<00:41, 10.91it/s]
Iteration: 19%|████████████▊ | 104/560 [00:05<00:36, 12.51it/s]
······
Iteration: 76%|████████████████████████████████████████████████████▋ | 428/560 [00:25<00:08, 16.11it/s]
Iteration: 77%|████████████████████████████████████████████████████▉ | 430/560 [00:26<00:07, 16.32it/s]
Iteration: 77%|█████████████████████████████████████████████████████▏ | 432/560 [00:26<00:08, 15.96it/s]
Iteration: 78%|█████████████████████████████████████████████████████▍ | 434/560 [00:26<00:07, 16.03it/s]
Iteration: 78%|█████████████████████████████████████████████████████▋ | 436/560 [00:26<00:07, 15.92it/s]
Iteration: 78%|█████████████████████████████████████████████████████▉ | 438/560 [00:26<00:07, 16.11it/s]
Evaluating: 0%| | 0/32 [00:00<?, ?it/s]
Evaluating: 22%|███████████████▌ | 7/32 [00:00<00:00, 69.40it/s]
Evaluating: 44%|██████████████████████████████▋ | 14/32 [00:00<00:00, 68.31it/s]
Evaluating: 66%|█████████████████████████████████████████████▉ | 21/32 [00:00<00:00, 66.28it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 67.22it/s]
Iteration: 79%|██████████████████████████████████████████████████████▏ | 440/560 [00:27<00:07, 16.17it/s]
Epoch: 50%|██████████████████████████████████████▌ | 1/2 [00:58<00:58, 58.83s/it]
3. Model evaluation
Input:
# evaluate
trainer.load_model()
trainer.evaluate("dev")
Output:
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 90.84it/s]
{'loss': 1.2779558580368757, 'intent_acc': 0.714}
Input:
trainer.evaluate("test")
Output:
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 86.29it/s]
{'loss': 1.294722783778395, 'intent_acc': 0.7077267637178052}
4. The Python progress-bar module: tqdm
Finally, a quick look at tqdm, Python's progress-bar module; it is very handy.
Input:
import time  # needed for time.sleep below

train_iterator1 = trange(3, desc="Epoch")
dic = ['a', 'b', 'c']
for i in train_iterator1:
    pbar = tqdm(dic)  # wrap the iterable in a tqdm progress bar
    for _ in pbar:
        pbar.set_description('Processing ' + str(_))
        time.sleep(0.5)
Output:
Epoch: 0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/3 [00:00<?, ?it/s]
Processing a: 0%| | 0/3 [00:00<?, ?it/s]
Processing a: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.96it/s]
Processing b: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.96it/s]
Processing b: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.96it/s]
Processing c: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.96it/s]
Processing c: 100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00, 1.96it/s]
Epoch: 33%|█████████████████████████▋ | 1/3 [00:01<00:03, 1.53s/it]
0%| | 0/3 [00:00<?, ?it/s]
Processing a: 0%| | 0/3 [00:00<?, ?it/s]
Processing a: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.98it/s]
Processing b: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.98it/s]
Processing b: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.97it/s]
Processing c: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.97it/s]
Processing c: 100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00, 1.96it/s]
Epoch: 67%|███████████████████████████████████████████████████▎ | 2/3 [00:03<00:01, 1.53s/it]
0%| | 0/3 [00:00<?, ?it/s]
Processing a: 0%| | 0/3 [00:00<?, ?it/s]
Processing a: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.97it/s]
Processing b: 33%|███████████████████████▎ | 1/3 [00:00<00:01, 1.97it/s]
Processing b: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.98it/s]
Processing c: 67%|██████████████████████████████████████████████▋ | 2/3 [00:01<00:00, 1.98it/s]
Processing c: 100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00, 1.96it/s]
Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00, 1.53s/it]
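tqdm can also be driven manually when the total amount of work is known but there is no iterable to wrap. A small sketch (the step count and the fake loss value are arbitrary):

from tqdm import tqdm
import time

pbar = tqdm(total=10, desc="Manual")
for step in range(10):
    time.sleep(0.1)
    pbar.update(1)                            # advance the bar by one step
    pbar.set_postfix(loss=1.0 / (step + 1))   # show extra info after the bar
pbar.close()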