继RNN建模SMILES进行反应产率预测后,尝试使用Transformer建模SMILES进行反应产率预测。Task3:Transformer建模SMILES进行反应产率预测
Transformer优势
- 并行处理能力:通过自注意力机制,模型能够同时处理序列中的所有元素,因此在处理长序列数据时具有很高的并行性,提高了计算效率。
- 缓解梯度消失问题:使用残差连接来减轻深层网络训练中的梯度消失问题,因此可以构建更深的网络结构而不会损失性能。
Transformer初步了解
基本框架了解
可以看到编码器(Encoder)和解码器(Decoder)为主要组成部分,且编码器和解码器都包含多个相同的层级结构。重要结构及功能:
- 自注意力机制(Self-Attention):允许模型在编码或解码时关注序列中的不同部分。"自注意力"指 Q、K、V 三者都由同一个输入序列(经过各自的线性投影)得到。
- 前馈网络(Feed Forward Network):对自注意力层的输出进行进一步的非线性变换。前馈层本质上是两个线性层,中间夹一个非线性激活函数(如 ReLU)。
- 残差连接和层归一化(Add & LayerNorm):每个"Add & LayerNorm"层中的"Add"即为残差连接。残差连接帮助解决网络训练中的梯度消失问题,层归一化则有助于稳定和加速训练过程。
实践
建立Transformer模型
# 模型
'''
直接采用一个transformer encoder model就好了
'''
class TransformerEncoderModel(nn.Module):
    """Transformer-encoder regressor: one scalar prediction per token-id sequence.

    Pipeline: token embedding -> dropout -> stack of pre-LayerNorm encoder
    layers (followed by a final LayerNorm) -> take the vector at sequence
    position 0 -> three-layer MLP head with sigmoid activations.

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        d_model: Embedding / encoder hidden width.
        num_heads: Number of attention heads in every encoder layer.
        fnn_dim: Hidden width of the feed-forward sub-layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used on the embeddings and inside the
            encoder layers.
    """

    def __init__(self, input_dim, d_model, num_heads, fnn_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.layerNorm = nn.LayerNorm(d_model)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=fnn_dim,
            dropout=dropout,
            batch_first=True,
            norm_first=True,  # pre-LayerNorm variant
        )
        # NOTE: nn.TransformerEncoder stacks copies of this template layer, so
        # the template itself is not called in forward(). It is kept as an
        # attribute so the state_dict layout stays the same.
        self.encoder_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=num_layers,
            norm=self.layerNorm,
        )
        self.dropout = nn.Dropout(dropout)

        # Regression head: d_model -> 256 -> 96 -> 1, sigmoid between layers.
        widths = (d_model, 256, 96)
        head = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            head.append(nn.Linear(fan_in, fan_out))
            head.append(nn.Sigmoid())
        head.append(nn.Linear(widths[-1], 1))
        self.lc = nn.Sequential(*head)

    def forward(self, src):
        """Return one prediction per sequence.

        Args:
            src: Integer token ids of shape [batch_size, src_len].

        Returns:
            Tensor of shape [batch_size].
        """
        # [batch_size, src_len] -> [batch_size, src_len, d_model]
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)

        # Shape is unchanged by the encoder stack.
        encoded = self.transformer_encoder(token_vectors)

        # The vector at the first sequence position is used as the summary of
        # the whole sequence: [batch_size, d_model].
        summary = encoded[:, 0]

        # [batch_size, d_model] -> [batch_size, 1] -> [batch_size]
        prediction = self.lc(summary)
        return prediction.squeeze(-1)
训练
# 训练
def train():
    """Train ``TransformerEncoderModel`` on the round-1 data and keep the best weights.

    Loads ``../dataset/round1_train_data.csv``, trains with AdamW and MSE loss
    for ``N_EPOCHS`` epochs, and writes the model's ``state_dict`` to
    ``../model/transformer.pth`` every time the mean training loss of an epoch
    improves. There is no validation split: both the LR scheduler and the
    checkpoint criterion are driven by the training loss.
    """
    import os  # local import: only needed to create the checkpoint directory

    # ---- hyper-parameters ----
    # Passed to nn.Embedding as the number of embeddings, i.e. the vocabulary
    # size (not the sequence length).
    INPUT_DIM = 292
    D_MODEL = 512
    NUM_HEADS = 4
    FNN_DIM = 1024
    NUM_LAYERS = 4
    DROPOUT = 0.2
    CLIP = 1  # max gradient norm
    N_EPOCHS = 40
    LR = 1e-4
    MODEL_PATH = '../model/transformer.pth'

    start_time = time.time()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    data = read_data("../dataset/round1_train_data.csv")
    dataset = ReactionDataset(data)
    # The full dataset is used for training. To debug on a small slice, wrap it
    # first: Subset(dataset, list(range(n))).
    train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)

    model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT)
    model = model.to(device)
    model.train()

    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    criterion = nn.MSELoss()

    # Start from +inf so the first epoch always produces a checkpoint.
    best_train_loss = float('inf')
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

    for epoch in range(N_EPOCHS):
        epoch_loss = 0.0
        for i, (src, y) in enumerate(train_loader):
            src, y = src.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            epoch_loss += loss.item()
            if i % 50 == 0:
                # Report the running mean; the raw sum grows with the step
                # count and is not comparable between steps.
                print(f'Step: {i} | Train Loss: {epoch_loss / (i + 1):.4f}')

        # The epoch mean must be computed before it is handed to the scheduler.
        loss_in_a_epoch = epoch_loss / len(train_loader)
        scheduler.step(loss_in_a_epoch)
        print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}')

        if loss_in_a_epoch < best_train_loss:
            best_train_loss = loss_in_a_epoch
            # Keep only the weights of the best epoch so far.
            torch.save(model.state_dict(), MODEL_PATH)

    elapsed_time_minute = (time.time() - start_time) / 60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")


if __name__ == '__main__':
    train()
尝试修改 N_EPOCHS 参数进行优化
修改错误代码
# 训练
def train():
    """Train ``TransformerEncoderModel`` on the round-1 data and keep the best weights.

    Loads ``../dataset/round1_train_data.csv``, trains with AdamW and MSE loss
    for ``N_EPOCHS`` epochs, and writes the model's ``state_dict`` to
    ``../model/transformer.pth`` every time the mean training loss of an epoch
    improves. There is no validation split: both the LR scheduler and the
    checkpoint criterion are driven by the training loss.
    """
    import os  # local import: only needed to create the checkpoint directory

    # ---- hyper-parameters ----
    # Passed to nn.Embedding as the number of embeddings, i.e. the vocabulary
    # size (not the sequence length).
    INPUT_DIM = 292
    D_MODEL = 512
    NUM_HEADS = 4
    FNN_DIM = 1024
    NUM_LAYERS = 4
    DROPOUT = 0.2
    CLIP = 1  # max gradient norm
    N_EPOCHS = 40
    LR = 1e-4
    MODEL_PATH = '../model/transformer.pth'

    start_time = time.time()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    data = read_data("../dataset/round1_train_data.csv")
    dataset = ReactionDataset(data)
    # The full dataset is used for training. To debug on a small slice, wrap it
    # first: Subset(dataset, list(range(n))).
    train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)

    model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT)
    model = model.to(device)
    model.train()

    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    criterion = nn.MSELoss()

    # Start from +inf so the first epoch always produces a checkpoint.
    best_train_loss = float('inf')
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

    for epoch in range(N_EPOCHS):
        epoch_loss = 0.0
        for i, (src, y) in enumerate(train_loader):
            src, y = src.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            epoch_loss += loss.item()
            if i % 50 == 0:
                # Report the running mean; the raw sum grows with the step
                # count and is not comparable between steps.
                print(f'Step: {i} | Train Loss: {epoch_loss / (i + 1):.4f}')

        # Order matters: loss_in_a_epoch has to be assigned BEFORE it is passed
        # to scheduler.step(). Calling scheduler.step(loss_in_a_epoch) first
        # raises UnboundLocalError at the end of the first epoch, because the
        # local variable has not been bound yet.
        loss_in_a_epoch = epoch_loss / len(train_loader)
        scheduler.step(loss_in_a_epoch)
        print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}')

        if loss_in_a_epoch < best_train_loss:
            best_train_loss = loss_in_a_epoch
            # Keep only the weights of the best epoch so far.
            torch.save(model.state_dict(), MODEL_PATH)

    elapsed_time_minute = (time.time() - start_time) / 60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")


if __name__ == '__main__':
    train()
scheduler.step(loss_in_a_epoch)
loss_in_a_epoch = epoch_loss / len(train_loader)
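变量 `loss_in_a_epoch` 在被使用时尚未赋值:这两行先在 `scheduler.step` 中使用它,之后才对它赋值,因此 Python 抛出 `UnboundLocalError`。正确的顺序是先计算 `loss_in_a_epoch = epoch_loss / len(train_loader)`,再调用 `scheduler.step(loss_in_a_epoch)`。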