BERT Named Entity Recognition

Load the named-entity dataset
import torch
import pandas as pd
import numpy as np
path = './'
comments = pd.read_csv(path + '英文命名实体信息.csv', encoding="latin1").fillna(method="ffill")
print('Total number of tokens: %d' % comments.shape[0])
Tags = list(set(comments['Tag']))

for tag in Tags: 
    print('Tag {}: {}'.format(tag, comments[comments.Tag==tag].shape[0]))
Total number of tokens: 1048575
Tag B-per: 16990
Tag B-geo: 37644
Tag I-gpe: 198
Tag I-geo: 7414
Tag I-org: 16784
Tag B-org: 20143
Tag B-tim: 20333
Tag B-art: 402
Tag I-per: 17251
Tag I-tim: 6528
Tag B-nat: 201
Tag B-eve: 308
Tag I-nat: 51
Tag I-eve: 253
Tag B-gpe: 15870
Tag O: 887908
Tag I-art: 297
SentenceMaking: group the tagged words back into sentences
class SentenceMaking(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # collect each sentence's rows into (word, POS, tag) triples
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
group_sentences = SentenceMaking(comments)
sentences = [" ".join([s[0] for s in sent]) for sent in group_sentences.sentences]
tags = [[s[2] for s in sent] for sent in group_sentences.sentences]
# tag to id
tag2idx = {t: i for i, t in enumerate(Tags)}
# list of all tag ids
labels = list(tag2idx.values())
labels
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
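As a quick check (a minimal sketch), we can print the mapping itself; note that Tags comes from a Python set, so the exact id assigned to each tag can differ between runs.
# print the tag-to-id mapping, sorted by id (ids depend on set iteration order)
for tag, idx in sorted(tag2idx.items(), key=lambda kv: kv[1]):
    print(idx, tag)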
A quick look at a few sentences
sentences[0:5]
['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
 'Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .',
 'Helicopter gunships Saturday pounded militant hideouts in the Orakzai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South Waziristan .',
 'They left after a tense hour-long standoff with riot police .',
 'U.N. relief coordinator Jan Egeland said Sunday , U.S. , Indonesian and Australian military helicopters are ferrying out food and supplies to remote areas of western Aceh province that ground crews can not reach .']
The longest sentence is 541 characters long
max_len = 0
lengthOfsentence = []
# Loop over every sentence...
for sent in sentences:

    lengthOfsentence.append(len(sent))
    # track the maximum sentence length (in characters)
    max_len = max(max_len, len(sent))

print('Longest sentence length: ', max_len)
Longest sentence length:  541
By inspection, most sentence lengths fall within 250 to 300 characters, so we take max_length = 256 when padding.
import matplotlib.pyplot as plt
plt.plot(lengthOfsentence)
plt.xlabel('sentence index')
plt.ylabel('sentence length (characters)')
plt.show()

(Figure: plot of per-sentence lengths)
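A histogram makes the length distribution easier to read than the per-index plot above (a minimal sketch; the bin count of 50 is an arbitrary choice).
plt.hist(lengthOfsentence, bins=50)
plt.xlabel('sentence length (characters)')
plt.ylabel('number of sentences')
plt.show()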

Download the BERT tokenizer.
from transformers import BertTokenizer
print('Downloading the BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
Downloading the BERT tokenizer...
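As a quick sanity check (a minimal sketch), we can look at how WordPiece tokenizes the first sentence. A word may be split into several sub-tokens, so the word-level tags above only line up approximately with the BERT tokens.
tokens = tokenizer.tokenize(sentences[0])
print(tokens[:12])
print(len(tokens), 'WordPiece tokens vs.', len(sentences[0].split()), 'whitespace-separated words')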
Pad each sentence's word-level tags to length 256
tags_masks = []
for tag in tags:
    # pad each sentence's tag-id sequence with zeros up to max_length = 256
    tag_id = np.zeros(256)
    for i, item in enumerate(tag):
        tag_id[i] = tag2idx[item]
    tags_masks.append(tag_id)
tags_masks = torch.tensor(tags_masks, dtype=torch.int64)
tags_masks[0]
tensor([15, 15, 15, 15, 15, 15,  1, 15, 15, 15, 15, 15,  1, 15, 15, 15, 15, 15,
        14, 15, 15, 15, 15, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0])
Build the BERT inputs: input_ids, attention_masks and labels
input_ids = []
attention_masks = []

for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = tags_masks
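To confirm what encode_plus produced, one encoded example can be decoded back into tokens (a minimal sketch).
toks = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
print(toks[:10])   # begins with [CLS]
print(toks[-5:])   # padded with [PAD] up to max_length
print(attention_masks[0][:10], attention_masks[0][-5:])  # 1 for real tokens, 0 for padding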
Build the training and validation datasets
from torch.utils.data import TensorDataset, random_split

# Wrap the inputs in a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Compute train_size and val_size.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# 90% of the dataset for training, 10% for validation.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
43,163 training samples
 4,796 validation samples
Create the DataLoaders
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# A batch_size of 16 or 32 is recommended
batch_size = 16

# Build DataLoaders for the training and validation sets. 
train_dataloader = DataLoader(
            train_dataset,  # training data.
            sampler = RandomSampler(train_dataset), # shuffle the order
            batch_size = batch_size 
        )

validation_dataloader = DataLoader(
            val_dataset, # validation data.
            sampler = RandomSampler(val_dataset), # shuffle the order
            batch_size = batch_size 
        )
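A quick shape check on one batch (a minimal sketch):
b_ids, b_masks, b_tags = next(iter(train_dataloader))
print(b_ids.shape, b_masks.shape, b_tags.shape)  # each should be torch.Size([16, 256])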

Load the BERT token-classification model BertForTokenClassification
from transformers import BertForTokenClassification, AdamW, BertConfig

model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased", # the 12-layer BERT base model.
    num_labels = len(tag2idx), # the classifier head has len(tag2idx) output labels.
    output_attentions = False, # do not return attention weights.
    output_hidden_states = False, # do not return all hidden states.
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # `device` is used in the training loop below
model.to(device)
BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (2): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (3): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (4): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (5): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (6): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (7): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (8): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (9): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (10): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (11): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=17, bias=True)
)
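As a rough size check (a minimal sketch), we can count the model's parameters.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('total parameters: {:,}  trainable parameters: {:,}'.format(total_params, trainable_params))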
Choose the optimizer
# AdamW is a class from the huggingface library; the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon - default is 1e-8, keeps the denominator from reaching zero
                )
Set up a learning rate scheduler to adjust the learning rate during training.
from transformers import get_linear_schedule_with_warmup

# For BERT fine-tuning, 2 to 4 epochs are recommended.
epochs = 2

# Number of training steps: [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
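With num_warmup_steps = 0 this schedule simply decays the learning rate linearly from 2e-5 to zero over total_steps. A minimal sketch of that curve, computed analytically so the real scheduler above is not advanced:
lrs = [2e-5 * max(0.0, 1 - step / total_steps) for step in range(total_steps)]
plt.plot(lrs)
plt.xlabel('training step')
plt.ylabel('learning rate')
plt.show()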
flat_accuracy computes the token-level accuracy over non-padding positions
def flat_accuracy(preds, labels, attention):
    # keep the predicted class only where the attention mask is 1 (real tokens)
    pred_flat = (np.argmax(preds, axis=2) * attention).flatten()
    labels_flat = labels.flatten()
    atten = attention.flatten()
    # accuracy counted over non-padding positions only
    return np.sum((pred_flat == labels_flat) * atten) / np.sum(atten)
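A tiny worked example (a minimal sketch) of how the attention mask is used: two positions are real tokens (mask = 1) and two are padding (mask = 0), so only the real positions count towards the accuracy.
toy_preds = np.array([[[0.1, 0.9], [0.8, 0.2], [0.7, 0.3], [0.6, 0.4]]])  # shape (1, 4, 2); argmax -> [1, 0, 0, 0]
toy_labels = np.array([[1, 1, 0, 0]])
toy_mask = np.array([[1, 1, 0, 0]])
print(flat_accuracy(toy_preds, toy_labels, toy_mask))  # 0.5: one of the two real tokens is predicted correctly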
format_time formats elapsed time
import time
import datetime
def format_time(elapsed):    
    elapsed_rounded = int(round((elapsed)))    
    # Return the time as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
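For example:
print(format_time(3661.4))  # '1:01:01'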
Train the model
import os
import random
import numpy as np
from transformers import WEIGHTS_NAME, CONFIG_NAME

output_dir = './'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# Code adapted from https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the random seeds.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Record training loss, validation loss, validation accuracy and timings.
training_stats = []

# Track the total training time.
total_t0 = time.time()
best_val_accuracy = 0

for epoch_i in range(0, epochs):      
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))  

    # Track how long this epoch takes
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()
  
    for step, batch in enumerate(train_dataloader):

        # Report elapsed time every 40 batches.
        if step % 40 == 0 and not step == 0:            
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains 3 tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously accumulated gradients
        model.zero_grad()        

        # forward pass        
        # see https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
       
        total_train_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to help prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the model parameters.
        optimizer.step()
       
        # Update the learning rate.
        scheduler.step()        
             
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()      
        attention_mask = b_input_mask.cpu().numpy()
        
        # Accumulate the training accuracy for this batch.
        total_train_accuracy += flat_accuracy(logit, label_id,attention_mask)
     
    # Average the training loss over all batches.
    avg_train_loss = total_train_loss / len(train_dataloader)      
    # Measure how long this epoch's training took.
    training_time = format_time(time.time() - t0)
    
    # Training-set accuracy.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print("  Training accuracy: {0:.2f}".format(avg_train_accuracy))
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training time: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    t0 = time.time()

    # Put the model in evaluation mode; dropout layers behave differently during evaluation
    model.eval()

    # Reset the evaluation metrics
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:        

        # `batch` contains 3 tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)        

        # During evaluation we do not update weights or build the computation graph
        with torch.no_grad():        

            # see https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()        
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()      
        attention_mask = b_input_mask.cpu().numpy()

        # Accumulate the validation accuracy for this batch.
        total_eval_accuracy += flat_accuracy(logit, label_id,attention_mask)
        
    # Average validation accuracy.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("")
    print("  Validation accuracy: {0:.2f}".format(avg_val_accuracy))
    
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(),output_model_file)
        model.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)
         

    # Average the validation loss over all batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure the validation time.
    validation_time = format_time(time.time() - t0)
    
    print("  Average validation loss: {0:.2f}".format(avg_val_loss))
    print("  Validation time: {:}".format(validation_time))

    # Record the statistics for this epoch
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
Epoch 1 / 2
  Batch    40  of  2,698.    Elapsed: 0:00:53.
  Batch    80  of  2,698.    Elapsed: 0:01:46.
  Batch   120  of  2,698.    Elapsed: 0:02:39.
  Batch   160  of  2,698.    Elapsed: 0:03:32.
  Batch   200  of  2,698.    Elapsed: 0:04:24.
  Batch   240  of  2,698.    Elapsed: 0:05:17.
  Batch   280  of  2,698.    Elapsed: 0:06:10.
  Batch   320  of  2,698.    Elapsed: 0:07:03.
  Batch   360  of  2,698.    Elapsed: 0:07:56.
  Batch   400  of  2,698.    Elapsed: 0:08:48.
  Batch   440  of  2,698.    Elapsed: 0:09:41.
  Batch   480  of  2,698.    Elapsed: 0:10:34.
  Batch   520  of  2,698.    Elapsed: 0:11:27.
  Batch   560  of  2,698.    Elapsed: 0:12:20.
  Batch   600  of  2,698.    Elapsed: 0:13:13.
  Batch   640  of  2,698.    Elapsed: 0:14:05.
  Batch   680  of  2,698.    Elapsed: 0:14:58.
  Batch   720  of  2,698.    Elapsed: 0:15:51.
  Batch   760  of  2,698.    Elapsed: 0:16:44.
  Batch   800  of  2,698.    Elapsed: 0:17:37.
  Batch   840  of  2,698.    Elapsed: 0:18:30.
  Batch   880  of  2,698.    Elapsed: 0:19:22.
  Batch   920  of  2,698.    Elapsed: 0:20:15.
  Batch   960  of  2,698.    Elapsed: 0:21:08.
  Batch 1,000  of  2,698.    Elapsed: 0:22:01.
  Batch 1,040  of  2,698.    Elapsed: 0:22:54.
  Batch 1,080  of  2,698.    Elapsed: 0:23:47.
  Batch 1,120  of  2,698.    Elapsed: 0:24:40.
  Batch 1,160  of  2,698.    Elapsed: 0:25:33.
  Batch 1,200  of  2,698.    Elapsed: 0:26:26.
  Batch 1,240  of  2,698.    Elapsed: 0:27:18.
  Batch 1,280  of  2,698.    Elapsed: 0:28:11.
  Batch 1,320  of  2,698.    Elapsed: 0:29:04.
  Batch 1,360  of  2,698.    Elapsed: 0:29:57.
  Batch 1,400  of  2,698.    Elapsed: 0:30:50.
  Batch 1,440  of  2,698.    Elapsed: 0:31:43.
  Batch 1,480  of  2,698.    Elapsed: 0:32:36.
  Batch 1,520  of  2,698.    Elapsed: 0:33:28.
  Batch 1,560  of  2,698.    Elapsed: 0:34:21.
  Batch 1,600  of  2,698.    Elapsed: 0:35:14.
  Batch 1,640  of  2,698.    Elapsed: 0:36:07.
  Batch 1,680  of  2,698.    Elapsed: 0:37:00.
  Batch 1,720  of  2,698.    Elapsed: 0:37:52.
  Batch 1,760  of  2,698.    Elapsed: 0:38:45.
  Batch 1,800  of  2,698.    Elapsed: 0:39:38.
  Batch 1,840  of  2,698.    Elapsed: 0:40:31.
  Batch 1,880  of  2,698.    Elapsed: 0:41:24.
  Batch 1,920  of  2,698.    Elapsed: 0:42:17.
  Batch 1,960  of  2,698.    Elapsed: 0:43:09.
  Batch 2,000  of  2,698.    Elapsed: 0:44:02.
  Batch 2,040  of  2,698.    Elapsed: 0:44:55.
  Batch 2,080  of  2,698.    Elapsed: 0:45:48.
  Batch 2,120  of  2,698.    Elapsed: 0:46:41.
  Batch 2,160  of  2,698.    Elapsed: 0:47:34.
  Batch 2,200  of  2,698.    Elapsed: 0:48:27.
  Batch 2,240  of  2,698.    Elapsed: 0:49:19.
  Batch 2,280  of  2,698.    Elapsed: 0:50:12.
  Batch 2,320  of  2,698.    Elapsed: 0:51:05.
  Batch 2,360  of  2,698.    Elapsed: 0:51:58.
  Batch 2,400  of  2,698.    Elapsed: 0:52:51.
  Batch 2,440  of  2,698.    Elapsed: 0:53:43.
  Batch 2,480  of  2,698.    Elapsed: 0:54:36.
  Batch 2,520  of  2,698.    Elapsed: 0:55:29.
  Batch 2,560  of  2,698.    Elapsed: 0:56:22.
  Batch 2,600  of  2,698.    Elapsed: 0:57:15.
  Batch 2,640  of  2,698.    Elapsed: 0:58:08.
  Batch 2,680  of  2,698.    Elapsed: 0:59:01.
  Training accuracy: 0.95
  Average training loss: 0.16
  Training time: 0:59:24

  Validation accuracy: 0.96
  Average validation loss: 0.14
  Validation time: 0:02:24
Epoch 2 / 2
  Batch    40  of  2,698.    Elapsed: 0:00:53.
  Batch    80  of  2,698.    Elapsed: 0:01:46.
  Batch   120  of  2,698.    Elapsed: 0:02:39.
  Batch   160  of  2,698.    Elapsed: 0:03:31.
  Batch   200  of  2,698.    Elapsed: 0:04:24.
  Batch   240  of  2,698.    Elapsed: 0:05:17.
  Batch   280  of  2,698.    Elapsed: 0:06:10.
  Batch   320  of  2,698.    Elapsed: 0:07:03.
  Batch   360  of  2,698.    Elapsed: 0:07:56.
  Batch   400  of  2,698.    Elapsed: 0:08:49.
  Batch   440  of  2,698.    Elapsed: 0:09:42.
  Batch   480  of  2,698.    Elapsed: 0:10:35.
  Batch   520  of  2,698.    Elapsed: 0:11:28.
  Batch   560  of  2,698.    Elapsed: 0:12:20.
  Batch   600  of  2,698.    Elapsed: 0:13:13.
  Batch   640  of  2,698.    Elapsed: 0:14:06.
  Batch   680  of  2,698.    Elapsed: 0:14:59.
  Batch   720  of  2,698.    Elapsed: 0:15:52.
  Batch   760  of  2,698.    Elapsed: 0:16:45.
  Batch   800  of  2,698.    Elapsed: 0:17:37.
  Batch   840  of  2,698.    Elapsed: 0:18:30.
  Batch   880  of  2,698.    Elapsed: 0:19:23.
  Batch   920  of  2,698.    Elapsed: 0:20:16.
  Batch   960  of  2,698.    Elapsed: 0:21:09.
  Batch 1,000  of  2,698.    Elapsed: 0:22:02.
  Batch 1,040  of  2,698.    Elapsed: 0:22:54.
  Batch 1,080  of  2,698.    Elapsed: 0:23:47.
  Batch 1,120  of  2,698.    Elapsed: 0:24:40.
  Batch 1,160  of  2,698.    Elapsed: 0:25:33.
  Batch 1,200  of  2,698.    Elapsed: 0:26:26.
  Batch 1,240  of  2,698.    Elapsed: 0:27:18.
  Batch 1,280  of  2,698.    Elapsed: 0:28:11.
  Batch 1,320  of  2,698.    Elapsed: 0:29:04.
  Batch 1,360  of  2,698.    Elapsed: 0:29:57.
  Batch 1,400  of  2,698.    Elapsed: 0:30:50.
  Batch 1,440  of  2,698.    Elapsed: 0:31:42.
  Batch 1,480  of  2,698.    Elapsed: 0:32:35.
  Batch 1,520  of  2,698.    Elapsed: 0:33:28.
  Batch 1,560  of  2,698.    Elapsed: 0:34:21.
  Batch 1,600  of  2,698.    Elapsed: 0:35:14.
  Batch 1,640  of  2,698.    Elapsed: 0:36:07.
  Batch 1,680  of  2,698.    Elapsed: 0:36:59.
  Batch 1,720  of  2,698.    Elapsed: 0:37:52.
  Batch 1,760  of  2,698.    Elapsed: 0:38:45.
  Batch 1,800  of  2,698.    Elapsed: 0:39:38.
  Batch 1,840  of  2,698.    Elapsed: 0:40:31.
  Batch 1,880  of  2,698.    Elapsed: 0:41:24.
  Batch 1,920  of  2,698.    Elapsed: 0:42:17.
  Batch 1,960  of  2,698.    Elapsed: 0:43:09.
  Batch 2,000  of  2,698.    Elapsed: 0:44:02.
  Batch 2,040  of  2,698.    Elapsed: 0:44:55.
  Batch 2,080  of  2,698.    Elapsed: 0:45:48.
  Batch 2,120  of  2,698.    Elapsed: 0:46:41.
  Batch 2,160  of  2,698.    Elapsed: 0:47:34.
  Batch 2,200  of  2,698.    Elapsed: 0:48:27.
  Batch 2,240  of  2,698.    Elapsed: 0:49:19.
  Batch 2,280  of  2,698.    Elapsed: 0:50:12.
  Batch 2,320  of  2,698.    Elapsed: 0:51:05.
  Batch 2,360  of  2,698.    Elapsed: 0:51:58.
  Batch 2,400  of  2,698.    Elapsed: 0:52:51.
  Batch 2,440  of  2,698.    Elapsed: 0:53:44.
  Batch 2,480  of  2,698.    Elapsed: 0:54:37.
  Batch 2,520  of  2,698.    Elapsed: 0:55:29.
  Batch 2,560  of  2,698.    Elapsed: 0:56:22.
  Batch 2,600  of  2,698.    Elapsed: 0:57:15.
  Batch 2,640  of  2,698.    Elapsed: 0:58:08.
  Batch 2,680  of  2,698.    Elapsed: 0:59:01.
  Training accuracy: 0.96
  Average training loss: 0.12
  Training time: 0:59:24

  Validation accuracy: 0.97
  Average validation loss: 0.11
  Validation time: 0:02:24
Total training took 2:03:35 (h:mm:ss)
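Because the loop above saved the best checkpoint with torch.save, model.config.to_json_file and tokenizer.save_vocabulary, the fine-tuned model can later be reloaded directly from output_dir. A minimal sketch, assuming those files are still in place:
reloaded_model = BertForTokenClassification.from_pretrained(output_dir)
reloaded_tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
reloaded_model.to(device)
reloaded_model.eval()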
A quick test
_, logits = model(input_ids[50:60].to(device),
                  token_type_ids=None,
                  attention_mask=attention_masks[50:60].to(device),
                  labels=labels[50:60].to(device))
logit = logits.detach().cpu().numpy()
label_id = labels[50:60].cpu().numpy()
attention_mask = attention_masks[50:60].cpu().numpy()
flat_accuracy(logit, label_id,attention_mask)
0.9776785714285714
pred_flat = (np.argmax(logit, axis=2)*attention_mask)
labels_flat = label_id
pred_flat[3]
array([10,  1,  9,  9,  5,  9,  9,  3,  9,  9,  9,  9,  6,  9,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0])
labels_flat[3]
array([10,  1,  9,  9, 15,  9,  9,  3,  9,  9,  9,  9,  6,  9,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0])
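To read the comparison above more easily, the ids can be mapped back to tag names with the inverse of tag2idx (a minimal sketch; the slice of 14 matches the non-padding span visible above).
idx2tag = {i: t for t, i in tag2idx.items()}
print([idx2tag[int(i)] for i in pred_flat[3][:14]])
print([idx2tag[int(i)] for i in labels_flat[3][:14]])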

