导入命名实体
import torch
import pandas as pd
import numpy as np
# Directory that holds the NER CSV.
path = './'
# Forward-fill missing cells with the value from the row above.
# NOTE(review): presumably this propagates the "Sentence #" id, which the CSV
# seems to write only on the first token of each sentence -- confirm against the data.
comments = pd.read_csv(path + '英文命名实体信息.csv', encoding="latin1").ffill()
print('命名实体总数:%d' % comments.shape[0])
# Sort the distinct tags: iterating a bare set() of strings yields a different
# order in every interpreter run (string hashing is randomized), which would
# make any tag -> id mapping built from Tags, and any checkpoint trained with
# it, non-reproducible.
Tags = sorted(set(comments['Tag']))
# Count every tag in a single pass instead of re-filtering the frame per tag.
tag_counts = comments['Tag'].value_counts()
for tag in Tags:
    print('命名实体({}):{}'.format(tag, tag_counts[tag]))
命名实体总数:1048575
命名实体(B-per):16990
命名实体(B-geo):37644
命名实体(I-gpe):198
命名实体(I-geo):7414
命名实体(I-org):16784
命名实体(B-org):20143
命名实体(B-tim):20333
命名实体(B-art):402
命名实体(I-per):17251
命名实体(I-tim):6528
命名实体(B-nat):201
命名实体(B-eve):308
命名实体(I-nat):51
命名实体(I-eve):253
命名实体(B-gpe):15870
命名实体(O):887908
命名实体(I-art):297
SentenceMaking 把实体组成句子
class SentenceMaking(object):
    """Group a token-level table into per-sentence lists of (word, POS, tag).

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes set by the constructor:
        n_sent:    always initialised to 1.
        data:      the table passed in, stored as-is.
        empty:     always initialised to False.
        grouped:   result of grouping ``data`` by "Sentence #" and collecting
                   each group's rows into a list of triples.
        sentences: plain list with one ``[(word, pos, tag), ...]`` entry per group.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def collect_rows(frame):
            # One (word, pos, tag) triple per row of this group's sub-frame.
            words = frame["Word"].values.tolist()
            pos_tags = frame["POS"].values.tolist()
            ner_tags = frame["Tag"].values.tolist()
            return list(zip(words, pos_tags, ner_tags))

        self.grouped = self.data.groupby("Sentence #").apply(collect_rows)
        self.sentences = list(self.grouped)
# Regroup the token rows into sentences of (word, pos, tag) triples.
group_sentences = SentenceMaking(comments)

# For every sentence keep the space-joined text and the parallel list of tags.
sentences = []
tags = []
for triples in group_sentences.sentences:
    sentences.append(" ".join(triple[0] for triple in triples))
    tags.append([triple[2] for triple in triples])

# Map each tag to its position in Tags.
tag2idx = dict(zip(Tags, range(len(Tags))))

# All tag ids, in mapping order (displayed as the cell result).
labels = list(tag2idx.values())
labels
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
简单查看一下句子
# Notebook display expression: show the first five space-joined sentences.
sentences[0:5]
['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
'Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .',
'Helicopter gunships Saturday pounded militant hideouts in the Orakzai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South Waziristan .',
'They left after a tense hour-long standoff with riot police .',
'U.N. relief coordinator Jan Egeland said Sunday , U.S. , Indonesian and Australian military helicopters are ferrying out food and supplies to remote areas of western Aceh province that ground crews can not reach .']
最长句子的长度为 541
# Length of every sentence string. len() on a str counts characters, so these
# are character counts, not word or BERT-token counts.
lengthOfsentence = [len(text) for text in sentences]
# Longest sentence; stays 0 when there are no sentences at all.
max_len = max(lengthOfsentence, default=0)
print('最长的句子长度为: ', max_len)
最长的句子长度为: 541
根据观察,大多数句子的长度在 250 到 300 之间(注意:这里统计的是字符数,而不是 BERT 的 token 数),padding 时的 max_length 我们取 256。
import matplotlib.pyplot as plt
# Line plot of the per-sentence lengths, in dataset order.
length_fig, length_axis = plt.subplots()
length_axis.plot(lengthOfsentence)
length_axis.set_ylabel('some numbers')
plt.show()
(图:各句子长度的折线图,由上面的 plt.plot(lengthOfsentence) 生成;原图在转存时丢失。)
下载 BERT tokenizer.
from transformers import BertTokenizer
# Announce the download, then load the tokenizer for the 'bert-base-uncased'
# checkpoint with lower-casing enabled.
print('下载 BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
下载 BERT tokenizer...
为句子中词的tag加上padding
# Fixed length every tag-id row is padded (or truncated) to.
tag_pad_len = 256

# Pre-allocate one int64 matrix instead of collecting 256-wide float arrays in
# a Python list: torch.tensor() on a list of ndarrays is very slow, and the
# float round-trip is unnecessary for integer ids.
# NOTE(review): the padding value 0 is also a real tag id, because tag2idx
# starts numbering at 0 -- padding and that tag are indistinguishable here.
tag_matrix = np.zeros((len(tags), tag_pad_len), dtype=np.int64)
for row, tag in enumerate(tags):
    # Truncate over-long sentences instead of raising IndexError.
    ids = [tag2idx[item] for item in tag[:tag_pad_len]]
    tag_matrix[row, :len(ids)] = ids

tags_masks = torch.from_numpy(tag_matrix)
# Notebook display expression: padded tag ids of the first sentence.
tags_masks[0]
tensor([15, 15, 15, 15, 15, 15, 1, 15, 15, 15, 15, 15, 1, 15, 15, 15, 15, 15,
14, 15, 15, 15, 15, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0])
bert模型输入input_ids,attention_masks 和 labels
# Build input_ids, attention_masks and labels with one entry per WORDPIECE.
#
# The tag lists hold one tag per word, but BERT sees "[CLS] piece piece ... [SEP]"
# where a single word may be split into several pieces. Using the word-level
# tag ids directly as labels shifts every label by one position ([CLS]) and
# drifts further after each split word. The sequence is therefore assembled
# word by word so that every piece carries the tag of the word it came from.
max_length = 256
# -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so padded
# positions never contribute to the loss and cannot be confused with tag id 0.
pad_label_id = -100
# [CLS] and [SEP] are not part of any entity; label them as "outside".
# Assumes the tag set contains 'O' (a missing 'O' raises KeyError here).
outside_label_id = tag2idx['O']

input_ids = []
attention_masks = []
aligned_labels = []
for sent in group_sentences.sentences:
    token_ids = [tokenizer.cls_token_id]
    token_labels = [outside_label_id]
    for word, _pos, tag in sent:
        pieces = tokenizer.tokenize(str(word))
        token_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
        # Every piece of a word inherits that word's tag; a word that yields
        # no pieces contributes nothing.
        token_labels.extend([tag2idx[tag]] * len(pieces))

    # Truncate so that [SEP] always fits, then close the sequence.
    token_ids = token_ids[:max_length - 1] + [tokenizer.sep_token_id]
    token_labels = token_labels[:max_length - 1] + [outside_label_id]

    n_real = len(token_ids)
    n_pad = max_length - n_real
    input_ids.append(token_ids + [tokenizer.pad_token_id] * n_pad)
    # 1 for real tokens, 0 for padding.
    attention_masks.append([1] * n_real + [0] * n_pad)
    aligned_labels.append(token_labels + [pad_label_id] * n_pad)

# Convert the lists to (num_sentences, max_length) int64 tensors.
input_ids = torch.tensor(input_ids, dtype=torch.int64)
attention_masks = torch.tensor(attention_masks, dtype=torch.int64)
labels = torch.tensor(aligned_labels, dtype=torch.int64)
设计 training,validation 和 test dataset
from torch.utils.data import TensorDataset, random_split
# 把input 放入 TensorDataset。
# Bundle the three aligned tensors so they are indexed and split together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the examples go to training; the remainder is the validation set.
n_examples = len(dataset)
train_size = int(0.9 * n_examples)
val_size = n_examples - train_size
split_sizes = [train_size, val_size]
train_dataset, val_dataset = random_split(dataset, split_sizes)

print('{:>5,} 训练数据'.format(train_size))
print('{:>5,} 验证数据'.format(val_size))
43,163 训练数据
4,796 验证数据
制作dataloader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# 推荐batch_size 为 16 或者 32
# A batch size of 16 or 32 is the recommended range.
batch_size = 16

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation is read in a fixed order: shuffling brings nothing for evaluation
# and makes the per-batch metrics differ from run to run.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)
导入 bert 命名实体模型 BertForTokenClassification
from transformers import BertForTokenClassification, AdamW, BertConfig
# Token-classification head on top of the 12-layer 'bert-base-uncased' encoder,
# with one output class per entry in tag2idx.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)
# Pick the device once and publish it as `device`: later cells move every
# batch with `.to(device)`, and a bare `model.cuda()` raises on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# .to() returns the module, so the notebook still displays the model here.
model.to(device)
BertForTokenClassification(
(bert): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=17, bias=True)
)
选择优化器
# AdamW 是一个 huggingface library 的类,'W' 是'Weight Decay fix"的意思。
# Optimizer over all model parameters, with an explicit learning rate and epsilon.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - the original note gives 5e-5 as the default
                  eps = 1e-8 # args.adam_epsilon - the original note gives 1e-8 as the default; guards against division by zero
                  )
设计learning rate scheduler, 调整learning rate.
from transformers import get_linear_schedule_with_warmup
# bert 推荐 epochs 在2到4之间为好。
# Between 2 and 4 epochs is the recommended range for BERT fine-tuning.
epochs = 2

# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
batches_per_epoch = len(train_dataloader)
total_steps = batches_per_epoch * epochs

# Linear learning-rate schedule with no warm-up phase
# (0 is the default value used in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
flat_accuracy 计算模型准确率
def flat_accuracy(preds, labels, attention):
    """Return the fraction of attended positions whose predicted class equals the label.

    Args:
        preds: array of class scores with the classes on axis 2; the predicted
            class of a position is the argmax over that axis.
        labels: array of gold class ids with one entry per position.
        attention: mask with one entry per position; positions with 0 are
            excluded from both the numerator and the denominator.

    Returns:
        Correct attended positions divided by the sum of the mask, or 0.0 when
        the mask sums to zero (the plain division would yield NaN there and
        poison any running average built from the result).
    """
    mask = np.asarray(attention).flatten()
    attended = np.sum(mask)
    if attended == 0:
        return 0.0
    # Predictions are zeroed outside the mask before comparing; those
    # positions are then dropped from the count by the second multiplication.
    pred_flat = (np.argmax(preds, axis=2) * attention).flatten()
    labels_flat = np.asarray(labels).flatten()
    return np.sum((pred_flat == labels_flat) * mask) / attended
format_time 计算所用时间
import time
import datetime
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    ``elapsed`` is rounded to a whole number of seconds with the built-in
    ``round`` and then formatted through ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)
训练模型
import os
import random
import numpy as np
from transformers import WEIGHTS_NAME, CONFIG_NAME
# Where the best checkpoint (weights, config, vocabulary) is written.
output_dir = './'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Loop structure adapted from
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator used below.
# NOTE(review): the train/validation split is drawn earlier in the file,
# before these seeds are set, so the split itself is not covered by them.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Move batches to wherever the model's weights live. Deriving the device from
# the model keeps this cell runnable on its own and on CPU-only machines.
device = next(model.parameters()).device

# Per-epoch record of training loss, validation loss/accuracy and timings.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

# Best validation accuracy seen so far; a checkpoint is written on improvement.
best_val_accuracy = 0

for epoch_i in range(0, epochs):
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))

    # ========================================
    # Training
    # ========================================
    # Wall-clock start of this epoch.
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds three tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients accumulated by the previous step.
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss first.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # Index rather than tuple-unpack: this works both when the model
        # returns a plain tuple and when it returns a dict-like output object
        # (unpacking the latter would yield its keys, not its values).
        loss, logits = outputs[0], outputs[1]
        total_train_loss += loss.item()

        # Backward pass.
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        attention_mask = b_input_mask.cpu().numpy()
        # Accumulate the per-batch training accuracy.
        total_train_accuracy += flat_accuracy(logit, label_id, attention_mask)

    # Averages over all training batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print(" 训练准确率: {0:.2f}".format(avg_train_accuracy))
    print(" 平均训练损失 loss: {0:.2f}".format(avg_train_loss))
    print(" 训练时间: {:}".format(training_time))

    # ========================================
    # Validation
    # ========================================
    t0 = time.time()
    # Evaluation mode: layers such as dropout behave differently than in training.
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        # Same three-tensor layout as in training.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.item()
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        attention_mask = b_input_mask.cpu().numpy()
        # Accumulate the per-batch validation accuracy.
        total_eval_accuracy += flat_accuracy(logit, label_id, attention_mask)

    # Average validation accuracy over all validation batches.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("")
    print(" 测试准确率: {0:.2f}".format(avg_val_accuracy))

    # Keep only the checkpoint with the best validation accuracy so far.
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), output_model_file)
        model.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)

    # Average validation loss and elapsed validation time.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    print(" 平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
    print(" 测试时间: {:}".format(validation_time))

    # Record this epoch's statistics.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
Epoch 1 / 2
Batch 40 of 2,698. Elapsed: 0:00:53.
Batch 80 of 2,698. Elapsed: 0:01:46.
Batch 120 of 2,698. Elapsed: 0:02:39.
Batch 160 of 2,698. Elapsed: 0:03:32.
Batch 200 of 2,698. Elapsed: 0:04:24.
Batch 240 of 2,698. Elapsed: 0:05:17.
Batch 280 of 2,698. Elapsed: 0:06:10.
Batch 320 of 2,698. Elapsed: 0:07:03.
Batch 360 of 2,698. Elapsed: 0:07:56.
Batch 400 of 2,698. Elapsed: 0:08:48.
Batch 440 of 2,698. Elapsed: 0:09:41.
Batch 480 of 2,698. Elapsed: 0:10:34.
Batch 520 of 2,698. Elapsed: 0:11:27.
Batch 560 of 2,698. Elapsed: 0:12:20.
Batch 600 of 2,698. Elapsed: 0:13:13.
Batch 640 of 2,698. Elapsed: 0:14:05.
Batch 680 of 2,698. Elapsed: 0:14:58.
Batch 720 of 2,698. Elapsed: 0:15:51.
Batch 760 of 2,698. Elapsed: 0:16:44.
Batch 800 of 2,698. Elapsed: 0:17:37.
Batch 840 of 2,698. Elapsed: 0:18:30.
Batch 880 of 2,698. Elapsed: 0:19:22.
Batch 920 of 2,698. Elapsed: 0:20:15.
Batch 960 of 2,698. Elapsed: 0:21:08.
Batch 1,000 of 2,698. Elapsed: 0:22:01.
Batch 1,040 of 2,698. Elapsed: 0:22:54.
Batch 1,080 of 2,698. Elapsed: 0:23:47.
Batch 1,120 of 2,698. Elapsed: 0:24:40.
Batch 1,160 of 2,698. Elapsed: 0:25:33.
Batch 1,200 of 2,698. Elapsed: 0:26:26.
Batch 1,240 of 2,698. Elapsed: 0:27:18.
Batch 1,280 of 2,698. Elapsed: 0:28:11.
Batch 1,320 of 2,698. Elapsed: 0:29:04.
Batch 1,360 of 2,698. Elapsed: 0:29:57.
Batch 1,400 of 2,698. Elapsed: 0:30:50.
Batch 1,440 of 2,698. Elapsed: 0:31:43.
Batch 1,480 of 2,698. Elapsed: 0:32:36.
Batch 1,520 of 2,698. Elapsed: 0:33:28.
Batch 1,560 of 2,698. Elapsed: 0:34:21.
Batch 1,600 of 2,698. Elapsed: 0:35:14.
Batch 1,640 of 2,698. Elapsed: 0:36:07.
Batch 1,680 of 2,698. Elapsed: 0:37:00.
Batch 1,720 of 2,698. Elapsed: 0:37:52.
Batch 1,760 of 2,698. Elapsed: 0:38:45.
Batch 1,800 of 2,698. Elapsed: 0:39:38.
Batch 1,840 of 2,698. Elapsed: 0:40:31.
Batch 1,880 of 2,698. Elapsed: 0:41:24.
Batch 1,920 of 2,698. Elapsed: 0:42:17.
Batch 1,960 of 2,698. Elapsed: 0:43:09.
Batch 2,000 of 2,698. Elapsed: 0:44:02.
Batch 2,040 of 2,698. Elapsed: 0:44:55.
Batch 2,080 of 2,698. Elapsed: 0:45:48.
Batch 2,120 of 2,698. Elapsed: 0:46:41.
Batch 2,160 of 2,698. Elapsed: 0:47:34.
Batch 2,200 of 2,698. Elapsed: 0:48:27.
Batch 2,240 of 2,698. Elapsed: 0:49:19.
Batch 2,280 of 2,698. Elapsed: 0:50:12.
Batch 2,320 of 2,698. Elapsed: 0:51:05.
Batch 2,360 of 2,698. Elapsed: 0:51:58.
Batch 2,400 of 2,698. Elapsed: 0:52:51.
Batch 2,440 of 2,698. Elapsed: 0:53:43.
Batch 2,480 of 2,698. Elapsed: 0:54:36.
Batch 2,520 of 2,698. Elapsed: 0:55:29.
Batch 2,560 of 2,698. Elapsed: 0:56:22.
Batch 2,600 of 2,698. Elapsed: 0:57:15.
Batch 2,640 of 2,698. Elapsed: 0:58:08.
Batch 2,680 of 2,698. Elapsed: 0:59:01.
训练准确率: 0.95
平均训练损失 loss: 0.16
训练时间: 0:59:24
测试准确率: 0.96
平均测试损失 Loss: 0.14
测试时间: 0:02:24
Epoch 2 / 2
Batch 40 of 2,698. Elapsed: 0:00:53.
Batch 80 of 2,698. Elapsed: 0:01:46.
Batch 120 of 2,698. Elapsed: 0:02:39.
Batch 160 of 2,698. Elapsed: 0:03:31.
Batch 200 of 2,698. Elapsed: 0:04:24.
Batch 240 of 2,698. Elapsed: 0:05:17.
Batch 280 of 2,698. Elapsed: 0:06:10.
Batch 320 of 2,698. Elapsed: 0:07:03.
Batch 360 of 2,698. Elapsed: 0:07:56.
Batch 400 of 2,698. Elapsed: 0:08:49.
Batch 440 of 2,698. Elapsed: 0:09:42.
Batch 480 of 2,698. Elapsed: 0:10:35.
Batch 520 of 2,698. Elapsed: 0:11:28.
Batch 560 of 2,698. Elapsed: 0:12:20.
Batch 600 of 2,698. Elapsed: 0:13:13.
Batch 640 of 2,698. Elapsed: 0:14:06.
Batch 680 of 2,698. Elapsed: 0:14:59.
Batch 720 of 2,698. Elapsed: 0:15:52.
Batch 760 of 2,698. Elapsed: 0:16:45.
Batch 800 of 2,698. Elapsed: 0:17:37.
Batch 840 of 2,698. Elapsed: 0:18:30.
Batch 880 of 2,698. Elapsed: 0:19:23.
Batch 920 of 2,698. Elapsed: 0:20:16.
Batch 960 of 2,698. Elapsed: 0:21:09.
Batch 1,000 of 2,698. Elapsed: 0:22:02.
Batch 1,040 of 2,698. Elapsed: 0:22:54.
Batch 1,080 of 2,698. Elapsed: 0:23:47.
Batch 1,120 of 2,698. Elapsed: 0:24:40.
Batch 1,160 of 2,698. Elapsed: 0:25:33.
Batch 1,200 of 2,698. Elapsed: 0:26:26.
Batch 1,240 of 2,698. Elapsed: 0:27:18.
Batch 1,280 of 2,698. Elapsed: 0:28:11.
Batch 1,320 of 2,698. Elapsed: 0:29:04.
Batch 1,360 of 2,698. Elapsed: 0:29:57.
Batch 1,400 of 2,698. Elapsed: 0:30:50.
Batch 1,440 of 2,698. Elapsed: 0:31:42.
Batch 1,480 of 2,698. Elapsed: 0:32:35.
Batch 1,520 of 2,698. Elapsed: 0:33:28.
Batch 1,560 of 2,698. Elapsed: 0:34:21.
Batch 1,600 of 2,698. Elapsed: 0:35:14.
Batch 1,640 of 2,698. Elapsed: 0:36:07.
Batch 1,680 of 2,698. Elapsed: 0:36:59.
Batch 1,720 of 2,698. Elapsed: 0:37:52.
Batch 1,760 of 2,698. Elapsed: 0:38:45.
Batch 1,800 of 2,698. Elapsed: 0:39:38.
Batch 1,840 of 2,698. Elapsed: 0:40:31.
Batch 1,880 of 2,698. Elapsed: 0:41:24.
Batch 1,920 of 2,698. Elapsed: 0:42:17.
Batch 1,960 of 2,698. Elapsed: 0:43:09.
Batch 2,000 of 2,698. Elapsed: 0:44:02.
Batch 2,040 of 2,698. Elapsed: 0:44:55.
Batch 2,080 of 2,698. Elapsed: 0:45:48.
Batch 2,120 of 2,698. Elapsed: 0:46:41.
Batch 2,160 of 2,698. Elapsed: 0:47:34.
Batch 2,200 of 2,698. Elapsed: 0:48:27.
Batch 2,240 of 2,698. Elapsed: 0:49:19.
Batch 2,280 of 2,698. Elapsed: 0:50:12.
Batch 2,320 of 2,698. Elapsed: 0:51:05.
Batch 2,360 of 2,698. Elapsed: 0:51:58.
Batch 2,400 of 2,698. Elapsed: 0:52:51.
Batch 2,440 of 2,698. Elapsed: 0:53:44.
Batch 2,480 of 2,698. Elapsed: 0:54:37.
Batch 2,520 of 2,698. Elapsed: 0:55:29.
Batch 2,560 of 2,698. Elapsed: 0:56:22.
Batch 2,600 of 2,698. Elapsed: 0:57:15.
Batch 2,640 of 2,698. Elapsed: 0:58:08.
Batch 2,680 of 2,698. Elapsed: 0:59:01.
训练准确率: 0.96
平均训练损失 loss: 0.12
训练时间: 0:59:24
测试准确率: 0.97
平均测试损失 Loss: 0.11
测试时间: 0:02:24
训练一共用了 2:03:35 (h:mm:ss)
简单测试一下
# Spot-check the model on sentences 50..59.
# Take the device from the model so this cell does not depend on a global
# `device`, and run in eval mode without building an autograd graph.
device = next(model.parameters()).device
model.eval()
with torch.no_grad():
    outputs = model(input_ids[50:60].to(device),
                    token_type_ids=None,
                    attention_mask=attention_masks[50:60].to(device),
                    labels=labels[50:60].to(device))
# With labels supplied the output is (loss, logits); index 1 works for both
# tuple and dict-like return types.
logits = outputs[1]
logit = logits.detach().cpu().numpy()
label_id = labels[50:60].cpu().numpy()
attention_mask = attention_masks[50:60].cpu().numpy()
# Notebook display expression: masked token accuracy on these ten sentences.
flat_accuracy(logit, label_id, attention_mask)
0.9776785714285714
# Predicted class id per position (argmax over axis 2), zeroed wherever the
# attention mask is 0. Despite the name, nothing is flattened here.
pred_flat = (np.argmax(logit, axis=2)*attention_mask)
# Gold label ids for the same slice of sentences.
labels_flat = label_id
# Notebook display expression: predictions for the fourth sentence of the slice.
pred_flat[3]
array([10, 1, 9, 9, 5, 9, 9, 3, 9, 9, 9, 9, 6, 9, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0])
# Notebook display expression: gold label ids for the fourth sentence of the
# slice, for comparison with the predictions shown above it.
labels_flat[3]
array([10, 1, 9, 9, 15, 9, 9, 3, 9, 9, 9, 9, 6, 9, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0])
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0])