Biological Macromolecule Platform (10)
2021SC@SDUSC
0 This Week's Work
This week I finished reading the training part (train.py) of the attention codebase.
1 Walkthrough of the train.py code
1.1 New library imports
import argparse
import math
import time
import dill as pickle
from tqdm import tqdm
import numpy as np
import random
import os
import torch
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, Dataset, BucketIterator
from torchtext.datasets import TranslationDataset
import transformer.Constants as Constants
from transformer.Models import Transformer
from transformer.Optim import ScheduledOptim
The new library encountered here is torchtext, which ships many standard datasets along with utilities for building vocabularies and word vectors. Note that the Field / Dataset / BucketIterator interface imported above is the legacy torchtext API (moved to torchtext.legacy in 0.9 and removed in later releases), so this code requires an older torchtext.
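As a quick illustration, here is a minimal sketch of how these three classes fit together, assuming the legacy API; the field names and toy data are made up:

# Minimal sketch of the legacy torchtext pipeline (field names and data are made up).
from torchtext.data import Field, Example, Dataset, BucketIterator

SRC = Field(init_token='<s>', eos_token='</s>', pad_token='<blank>')
fields = [('src', SRC), ('trg', SRC)]
examples = [Example.fromlist([['ein', 'haus'], ['a', 'house']], fields),
            Example.fromlist([['zwei', 'katzen'], ['two', 'cats']], fields)]
dataset = Dataset(examples, fields)
SRC.build_vocab(dataset)                       # builds the stoi/itos lookup tables
iterator = BucketIterator(dataset, batch_size=2, device='cpu', train=True)
for batch in iterator:
    print(batch.src.shape)                     # (seq_len, batch) -- note the transposed layout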
1.2 Registering command-line arguments for the model
parser = argparse.ArgumentParser()
parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field
parser.add_argument('-train_path', default=None) # bpe encoded data
parser.add_argument('-val_path', default=None) # bpe encoded data
parser.add_argument('-epoch', type=int, default=10)
parser.add_argument('-b', '--batch_size', type=int, default=2048)
parser.add_argument('-d_model', type=int, default=512)
parser.add_argument('-d_inner_hid', type=int, default=2048)
parser.add_argument('-d_k', type=int, default=64)
parser.add_argument('-d_v', type=int, default=64)
parser.add_argument('-n_head', type=int, default=8)
parser.add_argument('-n_layers', type=int, default=6)
parser.add_argument('-warmup','--n_warmup_steps', type=int, default=4000)
parser.add_argument('-lr_mul', type=float, default=2.0)
parser.add_argument('-seed', type=int, default=None)
parser.add_argument('-dropout', type=float, default=0.1)
parser.add_argument('-embs_share_weight', action='store_true')
parser.add_argument('-proj_share_weight', action='store_true')
parser.add_argument('-scale_emb_or_prj', type=str, default='prj')
parser.add_argument('-output_dir', type=str, default=None)
parser.add_argument('-use_tb', action='store_true')
parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
parser.add_argument('-no_cuda', action='store_true')
parser.add_argument('-label_smoothing', action='store_true')
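Put together, a typical invocation (adapted from the repository's README; the pickle filename is only an example) might look like:

python train.py -data_pkl m30k_deen_shr.pkl -proj_share_weight -label_smoothing -b 256 -warmup 128000 -epoch 400 -output_dir output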
1.3 Deriving options from the parsed arguments: CUDA and word-vector dimension
opt = parser.parse_args()
opt.cuda = not opt.no_cuda
opt.d_word_vec = opt.d_model  # the embedding dimension must match d_model
device = torch.device('cuda' if opt.cuda else 'cpu')  # used by the data loaders and model below
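The -seed argument registered above is presumably consumed right after parsing by seeding the usual RNGs, which would also explain the random and numpy imports; a minimal sketch of that step:

if opt.seed is not None:
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)
    random.seed(opt.seed)
    torch.backends.cudnn.benchmark = False  # favour reproducibility over speed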
1.4 Loading the data
if all((opt.train_path, opt.val_path)):
    training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
elif opt.data_pkl:
    training_data, validation_data = prepare_dataloaders(opt, device)
else:
    raise ValueError('No data source given: pass -data_pkl, or both -train_path and -val_path')
print(opt)
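Neither prepare_dataloaders nor prepare_dataloaders_from_bpe_files is shown in this post. Judging from the opt attributes consumed when building the model below, the pickle variant must record the pad indices and vocabulary sizes on opt and wrap the examples in BucketIterators; a hypothetical sketch (the pickle keys are assumptions):

def prepare_dataloaders(opt, device):
    # Hypothetical sketch; the exact layout of the pickle is an assumption.
    data = pickle.load(open(opt.data_pkl, 'rb'))

    opt.src_pad_idx = data['vocab']['src'].vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    fields = {'src': data['vocab']['src'], 'trg': data['vocab']['trg']}
    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train, batch_size=opt.batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=opt.batch_size, device=device)
    return train_iterator, val_iterator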
1.5 Building the Transformer
- Pass in all the hyperparameters: the source/target vocabulary sizes and padding indices, d_k, d_v, d_model, d_inner, the number of layers, the number of attention heads, dropout, and the weight-sharing options.
transformer = Transformer(
    opt.src_vocab_size,
    opt.trg_vocab_size,
    src_pad_idx=opt.src_pad_idx,
    trg_pad_idx=opt.trg_pad_idx,
    trg_emb_prj_weight_sharing=opt.proj_share_weight,
    emb_src_trg_weight_sharing=opt.embs_share_weight,
    d_k=opt.d_k,
    d_v=opt.d_v,
    d_model=opt.d_model,
    d_word_vec=opt.d_word_vec,
    d_inner=opt.d_inner_hid,
    n_layers=opt.n_layers,
    n_head=opt.n_head,
    dropout=opt.dropout,
    scale_emb_or_prj=opt.scale_emb_or_prj).to(device)
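As a quick sanity check, a hypothetical smoke test; the flattened output shape is an assumption inferred from how cal_loss below treats pred.size(1) as the class dimension:

# Hypothetical smoke test (assumes both vocabularies have at least 100 entries).
src = torch.randint(5, 100, (2, 10)).to(device)   # (batch, src_len)
trg = torch.randint(5, 100, (2, 9)).to(device)    # (batch, trg_len)
pred = transformer(src, trg)
print(pred.shape)                                 # expected: (batch * trg_len, trg_vocab_size)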
- The optimizer: Adam wrapped in ScheduledOptim, which drives the warmup learning-rate schedule.
optimizer = ScheduledOptim(
    optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
    opt.lr_mul, opt.d_model, opt.n_warmup_steps)
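ScheduledOptim presumably implements the warmup schedule from "Attention Is All You Need", scaled by the extra lr_mul factor; a minimal sketch of the learning rate it would produce at step n:

def lr_at_step(n, d_model=512, n_warmup_steps=4000, lr_mul=2.0):
    # lr = lr_mul * d_model^-0.5 * min(n^-0.5, n * n_warmup_steps^-1.5)
    return lr_mul * d_model ** -0.5 * min(n ** -0.5, n * n_warmup_steps ** -1.5)

print(lr_at_step(1))     # tiny right at the start of warmup
print(lr_at_step(4000))  # peak at the end of warmup, roughly 1.4e-3 here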
- Kick off training:
train(transformer, training_data, validation_data, optimizer, device, opt)
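The train function itself is not covered in this post; roughly, it should loop over opt.epoch epochs, call train_epoch (shown below) and an analogous eval_epoch, and save checkpoints according to -save_mode. A hypothetical skeleton:

def train(model, training_data, validation_data, optimizer, device, opt):
    # Hypothetical skeleton; the real function also logs metrics and
    # writes 'best'/'all' checkpoints to opt.output_dir.
    for epoch_i in range(opt.epoch):
        train_loss, train_accu = train_epoch(
            model, training_data, optimizer, opt, device,
            smoothing=opt.label_smoothing)
        # ... run eval_epoch on validation_data and save a checkpoint ...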
1.6 Label smoothing
cal_performance computes the loss via cal_loss below and, at the same time, counts word-level accuracy over the non-padding positions:
def cal_performance(pred, gold, trg_pad_idx, smoothing=False):
    loss = cal_loss(pred, gold, trg_pad_idx, smoothing=smoothing)

    pred = pred.max(1)[1]                # argmax over the vocabulary -> predicted token ids
    gold = gold.contiguous().view(-1)    # flatten the gold sequence
    non_pad_mask = gold.ne(trg_pad_idx)  # mask out padding positions
    n_correct = pred.eq(gold).masked_select(non_pad_mask).sum().item()
    n_word = non_pad_mask.sum().item()

    return loss, n_correct, n_word
Computing the loss
- Compute the cross-entropy loss, applying label smoothing when smoothing=True.
- log_softmax is the log of the softmax output; computing it as one fused op is numerically more stable than softmax followed by log.
def cal_loss(pred, gold, trg_pad_idx, smoothing=False):
    gold = gold.contiguous().view(-1)

    if smoothing:
        eps = 0.1
        n_class = pred.size(1)

        # Smoothed targets: probability 1 - eps on the gold class,
        # eps / (n_class - 1) spread evenly over the other classes.
        one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prb = F.log_softmax(pred, dim=1)

        non_pad_mask = gold.ne(trg_pad_idx)
        loss = -(one_hot * log_prb).sum(dim=1)         # cross entropy against the soft targets
        loss = loss.masked_select(non_pad_mask).sum()  # sum over non-padding tokens; averaged later
    else:
        loss = F.cross_entropy(pred, gold, ignore_index=trg_pad_idx, reduction='sum')
    return loss
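To make the smoothing concrete: with eps = 0.1 and a 5-class toy vocabulary, the gold class keeps probability 0.9 and each remaining class receives 0.1 / 4 = 0.025:

pred = torch.randn(1, 5)        # one target position, 5 classes
gold = torch.tensor([2])        # gold class index
one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
one_hot = one_hot * (1 - 0.1) + (1 - one_hot) * 0.1 / (5 - 1)
print(one_hot)                  # tensor([[0.0250, 0.0250, 0.9000, 0.0250, 0.0250]])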
One epoch of the training phase
def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    model.train()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    desc = ' - (Training) '
    for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):

        # prepare data
        src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
        trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, opt.trg_pad_idx))

        # forward
        optimizer.zero_grad()
        pred = model(src_seq, trg_seq)

        # backward and update parameters
        loss, n_correct, n_word = cal_performance(
            pred, gold, opt.trg_pad_idx, smoothing=smoothing)
        loss.backward()
        optimizer.step_and_update_lr()

        # note keeping
        n_word_total += n_word
        n_word_correct += n_correct
        total_loss += loss.item()

    loss_per_word = total_loss / n_word_total
    accuracy = n_word_correct / n_word_total
    return loss_per_word, accuracy
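patch_src and patch_trg are defined elsewhere in train.py and not covered here. Presumably they transpose torchtext's (seq_len, batch) layout to (batch, seq_len), with patch_trg additionally splitting the target into the decoder input (all tokens but the last) and the flattened gold output (all tokens but the first); a sketch under those assumptions:

def patch_src(src, pad_idx):
    # (seq_len, batch) -> (batch, seq_len)
    return src.transpose(0, 1)

def patch_trg(trg, pad_idx):
    trg = trg.transpose(0, 1)
    # decoder input drops the last token; the gold answer drops the first one
    trg, gold = trg[:, :-1], trg[:, 1:].contiguous().view(-1)
    return trg, gold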
2 Plan for next week
Continue reading the attention-related code.