使用RNN模型预测化学反应的速率
class RNNModel(nn.Module):
def __init__(self, num_embed, input_size, hidden_size, output_size, num_layers, dropout, device):
super(RNNModel, self).__init__()
self.embed = nn.Embedding(num_embed, input_size)
self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers,
batch_first=True, dropout=dropout, bidirectional=True)
self.fc = nn.Sequential(nn.Linear(2 * num_layers * hidden_size, output_size),
nn.Sigmoid(),
nn.Linear(output_size, 1),
nn.Sigmoid())
def forward(self, x):
# x : [bs, seq_len]
x = self.embed(x)
# x : [bs, seq_len, input_size]
_, hn = self.rnn(x) # hn : [2*num_layers, bs, h_dim]
hn = hn.transpose(0,1)
z = hn.reshape(hn.shape[0], -1) # z shape: [bs, 2*num_layers*h_dim]
output = self.fc(z).squeeze(-1) # output shape: [bs, 1]
return output
首先定义一个RNN模型,值得注意的是这里使用了双向RNN,最后将两个方向的序列拼接起来。
接着对数据进行预处理
对SMILES 字符串(一种用于表示化学结构的字符串)进行分词(tokenize),并将分词结果转换为索引,以便用于机器学习模型。
def _pad_token_to_idx(self, tokens):
idx_list = []
for token in tokens:
tokens_idx = []
for i in token:
if i in self.vocab_dic.keys():
tokens_idx.append(self.vocab_dic[i])
else:
self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
tokens_idx.append(self.vocab_dic[i])
idx_list.append(tokens_idx)
return idx_list
为了防止出现词汇表以外的名词,对其进行遍历添加进词汇表。
然后对CSV文件进行处理,将各种数据进行拼接,形成化学方程式的形式,返回一个列表。
def read_data(file_path, train=True):
df = pd.read_csv(file_path)
reactant1 = df["Reactant1"].tolist()
reactant2 = df["Reactant2"].tolist()
product = df["Product"].tolist()
additive = df["Additive"].tolist()
solvent = df["Solvent"].tolist()
if train:
react_yield = df["Yield"].tolist()
else:
react_yield = [0 for i in range(len(reactant1))]
# 将reactant拼到一起,之间用.分开。product也拼到一起,用>分开
input_data_list = []
for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
input_info = ".".join([react1, react2])
input_info = ">".join([input_info, prod])
input_data_list.append(input_info)
output = [(react, y) for react, y in zip(input_data_list, react_yield)]
之后对数据进行处理,返回torch的张量
之后的代码主要是关于使用PyTorch进行深度学习模型训练的流程。
加载最佳模型,生成结果文件
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
data = read_data("../dataset/round1_train_data.csv")
dataset = ReactionDataset(data)
subset_indices = list(range(N))
subset_dataset = Subset(dataset, subset_indices)
train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
model = RNNModel(NUM_EMBED, INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, DROPOUT, device).to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=LR)
# criterion = nn.MSELoss() # MSE
criterion = nn.L1Loss() # MAE
best_loss = 10
for epoch in range(N_EPOCHS):
epoch_loss = 0
for i, (src, y) in enumerate(train_loader):
src, y = src.to(device), y.to(device)
optimizer.zero_grad()
output = model(src)
loss = criterion(output, y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
optimizer.step()
epoch_loss += loss.item()
loss_in_a_epoch = epoch_loss / len(train_loader)
print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}')
if loss_in_a_epoch < best_loss:
# 在训练循环结束后保存模型
torch.save(model.state_dict(), '../model/RNN.pth')
end_time = time.time() # 结束计时
# 计算并打印运行时间
elapsed_time_minute = (end_time - start_time)/60
print(f"Total running time: {elapsed_time_minute:.2f} minutes")
总结:
最终效果没有随机森林那么好,具体原因有待读者继续学习