0. Transformer model architecture
Link to the original paper: https://arxiv.org/pdf/1706.03762
The model architecture is shown below:

[Figure: the Transformer encoder-decoder architecture from the original paper]
The attention computation in the original paper is scaled dot-product attention:

Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
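To make the formula concrete, here is a minimal PyTorch sketch of scaled dot-product attention, ignoring masking and multiple heads (the tensor shapes in the example are just illustrative):

import math
import torch

def scaled_dot_product_attention(Q, K, V):
    """Q: (batch, num_queries, d_k), K: (batch, num_kv, d_k), V: (batch, num_kv, d_v)."""
    d_k = Q.shape[-1]
    scores = torch.bmm(Q, K.transpose(1, 2)) / math.sqrt(d_k)  # (batch, num_queries, num_kv)
    weights = torch.softmax(scores, dim=-1)                    # attention weights
    return torch.bmm(weights, V)                               # (batch, num_queries, d_v)

# e.g. a batch of 2 with 4 queries, 6 key-value pairs, d_k = d_v = 8:
out = scaled_dot_product_attention(torch.rand(2, 4, 8), torch.rand(2, 6, 8), torch.rand(2, 6, 8))
print(out.shape)  # torch.Size([2, 4, 8])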

To implement the Transformer in PyTorch, first set up the environment. The configuration here follows the book Dive into Deep Learning (《动手学深度学习》), which I recommend reading.
PS: if you do not use these exact versions, you may run into errors.
conda create --name d2l python=3.9
conda activate d2l
pip install torch==1.12.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install torchvision==0.13.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install d2l==0.17.6 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install tensorboard -i https://pypi.tuna.tsinghua.edu.cn/simple
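After installing, a quick sanity check that the versions match (the expected outputs are shown as comments; CUDA builds may append a suffix such as +cu113):

import torch
import torchvision

print(torch.__version__)          # expected: 1.12.0 (or 1.12.0+cpu / +cuXXX)
print(torchvision.__version__)    # expected: 0.13.0
print(torch.cuda.is_available())  # True if a GPU build and a GPU are present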
The model consists mainly of an encoder and a decoder.
The example given in Dive into Deep Learning is machine translation; the complete code is as follows:
1. Machine translation with the Transformer
import math
import torch
from torch import nn
from d2l import torch as d2l

# Position-wise feed-forward network; it is really just an MLP applied at every position.
#@save
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network"""
    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs,
                 **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))
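# Quick shape check (mirroring the example in the d2l book): the FFN only
# changes the size of the last axis, so a (2, 3, 4) input with
# ffn_num_outputs=8 gives a (2, 3, 8) output. You can run this separately:
#   ffn = PositionWiseFFN(4, 4, 8)
#   ffn(torch.ones((2, 3, 4))).shape   # torch.Size([2, 3, 8])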
#@save
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization"""
    def __init__(self, normalized_shape, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)
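# AddNorm keeps the shape of its inputs: with normalized_shape=[3, 4] and two
# (2, 3, 4) tensors, the output is again (2, 3, 4). For example:
#   add_norm = AddNorm([3, 4], 0.5)
#   add_norm(torch.ones((2, 3, 4)), torch.ones((2, 3, 4))).shape   # torch.Size([2, 3, 4])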
#@save
class EncoderBlock(nn.Module):
    """Transformer encoder block"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout,
            use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(
            ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        return self.addnorm2(Y, self.ffn(Y))
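# The encoder block also preserves the input shape (again following the d2l
# book's check), e.g. with num_hiddens=24:
#   X = torch.ones((2, 100, 24))
#   valid_lens = torch.tensor([3, 2])
#   encoder_blk = EncoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5)
#   encoder_blk(X, valid_lens).shape   # torch.Size([2, 100, 24])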
#@save
class TransformerEncoder(d2l.Encoder):
    """Transformer encoder"""
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block"+str(i),
                EncoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        # Since the positional encoding values are between -1 and 1,
        # the embeddings are scaled by the square root of the embedding
        # dimension before being added to the positional encoding.
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        return X
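# A two-layer encoder over token indices: the output has shape
# (batch_size, num_steps, num_hiddens). For example:
#   valid_lens = torch.tensor([3, 2])
#   encoder = TransformerEncoder(200, 24, 24, 24, 24, [100, 24], 24, 48, 8, 2, 0.5)
#   encoder(torch.ones((2, 100), dtype=torch.long), valid_lens).shape   # torch.Size([2, 100, 24])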
class DecoderBlock(nn.Module):
    """The i-th block of the decoder"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        self.attention1 = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        # During training, all tokens of the output sequence are processed at
        # the same time, so state[2][self.i] is initialized as None.
        # During prediction, the output sequence is decoded one token at a
        # time, so state[2][self.i] contains the representations decoded by
        # the i-th block up to the current time step.
        if state[2][self.i] is None:
            key_values = X
        else:
            key_values = torch.cat((state[2][self.i], X), axis=1)
        state[2][self.i] = key_values
        if self.training:
            batch_size, num_steps, _ = X.shape
            # Shape of dec_valid_lens: (batch_size, num_steps),
            # where every row is [1, 2, ..., num_steps]
            dec_valid_lens = torch.arange(
                1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            dec_valid_lens = None
        # Self-attention
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        # Encoder-decoder attention.
        # Shape of enc_outputs: (batch_size, num_steps, num_hiddens)
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state
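# The decoder block likewise preserves the shape of the target-side input;
# state packs (encoder outputs, encoder valid lengths, per-block cached
# keys/values). A quick check in the spirit of the d2l book:
#   X = torch.ones((2, 100, 24))
#   valid_lens = torch.tensor([3, 2])
#   encoder_blk = EncoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5)
#   decoder_blk = DecoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5, 0)
#   state = [encoder_blk(X, valid_lens), valid_lens, [None]]
#   decoder_blk(X, state)[0].shape   # torch.Size([2, 100, 24])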
class TransformerDecoder(d2l.AttentionDecoder):
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block"+str(i),
                DecoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, i))
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        return [enc_outputs, enc_valid_lens, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self._attention_weights = [[None] * len(self.blks) for _ in range(2)]
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            # Decoder self-attention weights
            self._attention_weights[0][
                i] = blk.attention1.attention.attention_weights
            # Encoder-decoder attention weights
            self._attention_weights[1][
                i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights
def main():
    num_hiddens, num_layers, dropout, batch_size, num_steps = 32, 2, 0.1, 64, 10
    lr, num_epochs, device = 0.005, 200, d2l.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
    key_size, query_size, value_size = 32, 32, 32
    norm_shape = [32]
    train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(
        len(src_vocab), key_size, query_size, value_size, num_hiddens,
        norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
        num_layers, dropout)
    decoder = TransformerDecoder(
        len(tgt_vocab), key_size, query_size, value_size, num_hiddens,
        norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
        num_layers, dropout)
    net = d2l.EncoderDecoder(encoder, decoder)
    d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
    engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
    fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
    for eng, fra in zip(engs, fras):
        translation, dec_attention_weight_seq = d2l.predict_seq2seq(
            net, eng, src_vocab, tgt_vocab, num_steps, device, True)
        print(f'{eng} => {translation}, ',
              f'bleu {d2l.bleu(translation, fra, k=2):.3f}')

if __name__ == '__main__':
    main()
With a small modification to the d2l.train_seq2seq function so that it prints the loss:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence.

    Defined in :numref:`sec_seq2seq_decoder`"""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = d2l.MaskedSoftmaxCELoss()  # qualified with d2l since this copy lives outside the library
    net.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            # animator.add(epoch + 1, (metric[0] / metric[1],))
            print(f'Epoch {epoch + 1} Loss: {metric[0] / metric[1]:.3f}')  # print the loss every 10 epochs
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
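The loss used above, d2l.MaskedSoftmaxCELoss, is an ordinary softmax cross-entropy that zeroes out the contribution of padding tokens beyond each target sequence's valid length. A simplified functional sketch of that idea (for illustration only, not the library class itself):

import torch
from torch import nn

def masked_softmax_ce(pred, label, valid_len):
    """pred: (batch, num_steps, vocab_size); label (long) and the mask: (batch, num_steps)."""
    # 1 for real tokens, 0 for padding positions beyond valid_len
    mask = torch.arange(label.shape[1], device=label.device)[None, :] < valid_len[:, None]
    weights = mask.float()
    # per-token cross-entropy, then mask and average over time steps
    unweighted = nn.functional.cross_entropy(
        pred.permute(0, 2, 1), label, reduction='none')   # (batch, num_steps)
    return (unweighted * weights).mean(dim=1)             # per-sequence masked loss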
The output is as follows:
Epoch 10 Loss: 0.181
Epoch 20 Loss: 0.133
Epoch 30 Loss: 0.100
Epoch 40 Loss: 0.078
Epoch 50 Loss: 0.067
Epoch 60 Loss: 0.056
Epoch 70 Loss: 0.053
Epoch 80 Loss: 0.046
Epoch 90 Loss: 0.047
Epoch 100 Loss: 0.044
Epoch 110 Loss: 0.039
Epoch 120 Loss: 0.039
Epoch 130 Loss: 0.038
Epoch 140 Loss: 0.036
Epoch 150 Loss: 0.038
Epoch 160 Loss: 0.033
Epoch 170 Loss: 0.036
Epoch 180 Loss: 0.031
Epoch 190 Loss: 0.031
Epoch 200 Loss: 0.033
loss 0.033, 10446.4 tokens/sec on cpu
go . => va !, bleu 1.000
i lost . => je sais signe ., bleu 0.000
he's calm . => il est calme ., bleu 1.000
i'm home . => je suis chez moi ., bleu 1.000
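For reference, the BLEU score printed above (d2l.bleu with k=2) combines a brevity penalty with n-gram precisions up to length k; a sketch along the lines of the d2l book's implementation:

import math
import collections

def bleu(pred_seq, label_seq, k):
    """BLEU: brevity penalty times weighted n-gram precisions, n = 1..k."""
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))   # brevity penalty
    for n in range(1, k + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[' '.join(label_tokens[i: i + n])] += 1
        for i in range(len_pred - n + 1):
            if label_subs[' '.join(pred_tokens[i: i + n])] > 0:
                num_matches += 1
                label_subs[' '.join(pred_tokens[i: i + n])] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score

print(bleu('il est calme .', 'il est calme .', k=2))   # 1.0 for an exact match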
2. Handwritten digit recognition with the Transformer encoder
Here only the encoder part of the Transformer is used, to perform a classification task.
The dataset is MNIST, which can be downloaded directly through PyTorch's official torchvision library.

Following the ideas above, it is easy to write the following code:
import math
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms

# Positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

# Transformer encoder layer
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        src2 = self.self_attn(src, src, src)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(nn.functional.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

# Transformer encoder
class TransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers):
        super(TransformerEncoder, self).__init__()
        # Use ModuleList with deep copies so that every layer has its own
        # parameters and all of them are registered with the module.
        self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

# MNIST classification model
class MNISTTransformerClassifier(nn.Module):
    def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, num_classes=10):
        super(MNISTTransformerClassifier, self).__init__()
        self.embedding = nn.Linear(28 * 28, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)       # flatten each image to a 784-dim vector
        x = self.embedding(x)           # project to d_model dimensions
        x = self.pos_encoder(x)         # add positional encoding
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)               # average over the first dimension
        x = self.fc(x)
        return x

# Data preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
# Load the training set
train_dataset = MNIST(root='/content/drive/MyDrive', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Load the test set
test_dataset = MNIST(root='/content/drive/MyDrive', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the model, loss function and optimizer
model = MNISTTransformerClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.000001)
# Number of training epochs
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch + 1}: Loss: {running_loss / len(train_loader)}')
Of course, we can modify this slightly and use TensorBoard to visualize training; if you are interested in the details, you can look up its usage yourself.
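The core TensorBoard workflow is simply to create a SummaryWriter, log scalars during training, and close the writer; a minimal sketch (the log directory name here is arbitrary):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/demo')        # logs go to ./runs/demo
for step in range(100):
    fake_loss = 1.0 / (step + 1)           # stand-in for a real training loss
    writer.add_scalar('Training Loss', fake_loss, step)
writer.close()
# Then run `tensorboard --logdir=runs` to view the curves.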
The complete code is as follows:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
import math
import copy

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        src2 = self.self_attn(src, src, src)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(nn.functional.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class TransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class MNISTTransformerClassifier(nn.Module):
    def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, num_classes=10):
        super(MNISTTransformerClassifier, self).__init__()
        self.embedding = nn.Linear(28 * 28, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)
        x = self.fc(x)
        return x

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
train_dataset = MNIST(root='/content/drive/MyDrive', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = MNIST(root='/content/drive/MyDrive', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MNISTTransformerClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.000003)
writer = SummaryWriter('runs/mnist_transformer-3')
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    # Training phase
    model.train()  # make sure we are in training mode
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted_train = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted_train == labels).sum().item()
        if (i + 1) % 200 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {running_loss / 100}')
            writer.add_scalar('Training Loss', running_loss / 100, epoch * len(train_loader) + i)
            running_loss = 0.0
    train_accuracy = 100 * correct_train / total_train
    print(f'Epoch {epoch + 1}: Training Accuracy: {train_accuracy}%')
    writer.add_scalar('Training Accuracy', train_accuracy, epoch)
    # Evaluate accuracy on the test set at the end of each epoch
    model.eval()  # switch to evaluation mode
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted_test = torch.max(outputs.data, 1)
            total_test += labels.size(0)
            correct_test += (predicted_test == labels).sum().item()
    test_accuracy = 100 * correct_test / total_test
    print(f'Epoch {epoch + 1}: Test Accuracy: {test_accuracy}%')
    writer.add_scalar('Test Accuracy', test_accuracy, epoch)
    model.train()  # switch back to training mode
writer.close()
Since I trained on a Google Colab GPU, entering the following is enough to bring up the visualization interface:
%load_ext tensorboard
%tensorboard --logdir=runs/mnist_transformer-3
The output is:
Epoch [1/10], Step [200/938], Loss: 3.664723302125931
Epoch [1/10], Step [400/938], Loss: 1.9123436504602431
Epoch [1/10], Step [600/938], Loss: 1.3068539306521416
Epoch [1/10], Step [800/938], Loss: 1.0478180265426635
Epoch 1: Training Accuracy: 74.92666666666666%
Epoch 1: Test Accuracy: 89.24%
Epoch [2/10], Step [200/938], Loss: 0.8359199245274067
Epoch [2/10], Step [400/938], Loss: 0.7377738726139068
Epoch [2/10], Step [600/938], Loss: 0.6923808173835277
Epoch [2/10], Step [800/938], Loss: 0.6495410591363907
Epoch 2: Training Accuracy: 90.20166666666667%
Epoch 2: Test Accuracy: 91.76%
Epoch [3/10], Step [200/938], Loss: 0.5807338447868824
Epoch [3/10], Step [400/938], Loss: 0.5677195289731025
Epoch [3/10], Step [600/938], Loss: 0.518595809713006
Epoch [3/10], Step [800/938], Loss: 0.5226419296115636
Epoch 3: Training Accuracy: 92.14333333333333%
Epoch 3: Test Accuracy: 92.99%
Epoch [4/10], Step [200/938], Loss: 0.4869861225038767
Epoch [4/10], Step [400/938], Loss: 0.46760185830295087
Epoch [4/10], Step [600/938], Loss: 0.43687341414391995
Epoch [4/10], Step [800/938], Loss: 0.4341541718691587
Epoch 4: Training Accuracy: 93.435%
Epoch 4: Test Accuracy: 93.95%
Epoch [5/10], Step [200/938], Loss: 0.3897417167946696
Epoch [5/10], Step [400/938], Loss: 0.4144923872128129
Epoch [5/10], Step [600/938], Loss: 0.3863081284239888
Epoch [5/10], Step [800/938], Loss: 0.36649829741567375
Epoch 5: Training Accuracy: 94.39333333333333%
Epoch 5: Test Accuracy: 94.41%
Epoch [6/10], Step [200/938], Loss: 0.3594508857280016
Epoch [6/10], Step [400/938], Loss: 0.3507236260548234
Epoch [6/10], Step [600/938], Loss: 0.3320340771973133
Epoch [6/10], Step [800/938], Loss: 0.3437390545755625
Epoch 6: Training Accuracy: 94.98666666666666%
Epoch 6: Test Accuracy: 95.12%
Epoch [7/10], Step [200/938], Loss: 0.3166021494567394
Epoch [7/10], Step [400/938], Loss: 0.3045706953108311
Epoch [7/10], Step [600/938], Loss: 0.29794814888387916
Epoch [7/10], Step [800/938], Loss: 0.29839959293603896
Epoch 7: Training Accuracy: 95.58166666666666%
Epoch 7: Test Accuracy: 95.39%
Epoch [8/10], Step [200/938], Loss: 0.27406183406710627
Epoch [8/10], Step [400/938], Loss: 0.2812281870096922
Epoch [8/10], Step [600/938], Loss: 0.25877184154465793
Epoch [8/10], Step [800/938], Loss: 0.2660152075439692
Epoch 8: Training Accuracy: 96.06666666666666%
Epoch 8: Test Accuracy: 95.88%
Epoch [9/10], Step [200/938], Loss: 0.2543461931310594
Epoch [9/10], Step [400/938], Loss: 0.25227617312222717
Epoch [9/10], Step [600/938], Loss: 0.24649917978793381
Epoch [9/10], Step [800/938], Loss: 0.24894069131463767
Epoch 9: Training Accuracy: 96.43833333333333%
Epoch 9: Test Accuracy: 96.04%
Epoch [10/10], Step [200/938], Loss: 0.24280567541718484
Epoch [10/10], Step [400/938], Loss: 0.23789384735748173
Epoch [10/10], Step [600/938], Loss: 0.21509589752182365
Epoch [10/10], Step [800/938], Loss: 0.20988688671961428
Epoch 10: Training Accuracy: 96.725%
Epoch 10: Test Accuracy: 96.31%
Screenshots of the loss and accuracy curves are shown below; without much hyperparameter tuning, the results are already quite decent.

[Figures: TensorBoard curves of training loss, training accuracy, and test accuracy]
If you spot any problems, feel free to point them out in the comments!