在代码运行之前先导入所需要的库。
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
from typing import List, Tuple
import re
import torch
import torch.nn as nn
import time
import torch.optim as optim
然后根据SMILES特征,将读取到的化学反应方程式按字符拆分,替换为SMILES词汇表中的序号,这样就完成了对化学方程式的有效建模,将其转化为计算机可使用的、可以拿来进行训练的数据。
# tokenizer,鉴于SMILES的特性,这里需要自己定义tokenizer和vocab
# 这里直接将smiles str按字符拆分,并替换为词汇表中的序号
# tokenizer: SMILES needs a custom tokenizer and vocab.
# Each SMILES string is split into tokens by a regex and each token is
# replaced by its index in the vocabulary file.
class Smiles_tokenizer():
    """Regex-based tokenizer for SMILES strings.

    The vocabulary file lists one token per line; the line number is the
    token's index. Unknown tokens are appended to the vocab on the fly.
    """

    def __init__(self, pad_token, regex, vocab_file, max_length):
        """
        pad_token:  padding token, e.g. "<PAD>"
        regex:      alternation of multi-character SMILES token patterns
        vocab_file: path to the vocabulary file (one token per line)
        max_length: maximum number of regex tokens kept per SMILES
        """
        self.pad_token = pad_token
        self.regex = regex
        self.vocab_file = vocab_file
        self.max_length = max_length
        with open(self.vocab_file, "r") as f:
            lines = f.readlines()
        lines = [line.strip("\n") for line in lines]
        vocab_dic = {}
        for index, token in enumerate(lines):
            vocab_dic[token] = index
        self.vocab_dic = vocab_dic

    def _regex_match(self, smiles):
        # Anything the SMILES regex does not match falls back to a
        # single-character token via the trailing '.' alternative.
        regex_string = r"(" + self.regex + r"|"
        regex_string += r".)"
        prog = re.compile(regex_string)
        tokenised = []
        for smi in smiles:
            tokens = prog.findall(smi)
            if len(tokens) > self.max_length:
                tokens = tokens[:self.max_length]
            tokenised.append(tokens)  # list of token lists
        return tokenised

    def tokenize(self, smiles):
        """Tokenize a list of SMILES strings.

        Returns (padded token lists, padded index lists).
        NOTE(review): <CLS>/<SEP> are added AFTER truncation, so sequences
        can reach max_length + 2 tokens — confirm this is intended.
        """
        tokens = self._regex_match(smiles)
        # Add the start/end markers: <CLS>, <SEP>.
        tokens = [["<CLS>"] + token + ["<SEP>"] for token in tokens]
        tokens = self._pad_seqs(tokens, self.pad_token)
        token_idx = self._pad_token_to_idx(tokens)
        return tokens, token_idx

    def _pad_seqs(self, seqs, pad_token):
        # Pad every sequence to the length of the longest one in the batch.
        pad_length = max([len(seq) for seq in seqs])
        padded = [seq + ([pad_token] * (pad_length - len(seq))) for seq in seqs]
        return padded

    def _pad_token_to_idx(self, tokens):
        """Map token lists to index lists, growing the vocab for unseen tokens."""
        idx_list = []
        new_vocab = []
        for token in tokens:
            tokens_idx = []
            for i in token:
                if i in self.vocab_dic.keys():
                    tokens_idx.append(self.vocab_dic[i])
                else:
                    # Unknown token: record it and assign the next free index.
                    new_vocab.append(i)
                    self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
                    tokens_idx.append(self.vocab_dic[i])
            idx_list.append(tokens_idx)
        # Fix: only touch the side-effect file when there actually are new
        # tokens; the original opened (and created) it on every call.
        if new_vocab:
            with open("../new_vocab_list.txt", "a") as f:
                for i in new_vocab:
                    f.write(i)
                    f.write("\n")
        return idx_list

    def _save_vocab(self, vocab_path):
        """Write the current vocabulary to vocab_path, one token per line."""
        with open(vocab_path, "w") as f:
            for i in self.vocab_dic.keys():
                f.write(i)
                f.write("\n")
        print("update new vocab!")
接下来的函数是读取csv文件,将文件中的两个反应物、产物、添加剂和溶剂分别提取出来,在这里我将溶剂、添加剂和两个反应物以符号“.”连接,由于我不是很清楚在SMILES中添加剂与溶剂究竟是使用哪种符号来表示的,也不清楚其在化学反应方程式中的顺序位置,所以连续换了好几个符号,比如“+”、“-”、“?”等,但是都报错了,最终只有符号“.”不会报错。
# 处理数据
def read_data(file_path, train=True):
df = pd.read_csv(file_path)
reactant1 = df["Reactant1"].tolist()
reactant2 = df["Reactant2"].tolist()
product = df["Product"].tolist()
additive = df["Additive"].tolist()
solvent = df["Solvent"].tolist()
if train:
react_yield = df["Yield"].tolist()
else:
react_yield = [0 for i in range(len(reactant1))]
# 将reactant\additive\solvent拼到一起,之间用.分开。product也拼到一起,用>>分开
input_data_list = []
for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
# input_info = ".".join([react1, react2, addi, sol])
input_info = ".".join([addi, sol])
input_info = ".".join([input_info, react1])
input_info = ".".join([input_info, react2])
input_info = ">".join([input_info, prod])
input_data_list.append(input_info)
output = [(react, y) for react, y in zip(input_data_list, react_yield)]
return output
接下来定义一下数据集,我之前就是根据collate_fn函数中的REGEX来添加溶剂和添加剂的,但是看到collate_fn又是另一个函数的参数时,我又看不怎么懂了,只好赌一把,但可惜赌输了(赌狗都没有好下场)
# 定义数据集
class ReactionDataset(Dataset):
def __init__(self, data: List[Tuple[List[str], float]]):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
def collate_fn(batch):
    """DataLoader collate: tokenize a batch of (smiles, yield) pairs.

    Returns (tensor of token indices, tensor of yields).
    """
    # SMILES token pattern: bracket atoms, two-letter elements, bonds, rings, etc.
    REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
    tokenizer = Smiles_tokenizer("<PAD>", REGEX, "../vocab_full.txt", 300)
    smi_list = [sample[0] for sample in batch]
    yield_list = [sample[1] for sample in batch]
    token_indices = tokenizer.tokenize(smi_list)[1]
    return torch.tensor(token_indices), torch.tensor(yield_list)
这里就是构建transformer的类了,也就是神经网络构建部分,不知道为什么我看到Sigmoid就想换成relu函数,有关数学理论的部分还是太迷糊了,我只知道这里肯定不能改。
# 模型
'''
直接采用一个transformer encoder model就好了
'''
class TransformerEncoderModel(nn.Module):
def __init__(self, input_dim, d_model, num_heads, fnn_dim, num_layers, dropout):
super().__init__()
self.embedding = nn.Embedding(input_dim, d_model)
self.layerNorm = nn.LayerNorm(d_model)
self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
nhead=num_heads,
dim_feedforward=fnn_dim,
dropout=dropout,
batch_first=True,
norm_first=True # pre-layernorm
)
self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer,
num_layers=num_layers,
norm=self.layerNorm)
self.dropout = nn.Dropout(dropout)
self.lc = nn.Sequential(nn.Linear(d_model, 256),
nn.Sigmoid(),
nn.Linear(256, 96),
nn.Sigmoid(),
nn.Linear(96, 1))
def forward(self, src):
# src shape: [batch_size, src_len]
embedded = self.dropout(self.embedding(src))
# embedded shape: [batch_size, src_len, d_model]
outputs = self.transformer_encoder(embedded)
# outputs shape: [batch_size, src_len, d_model]
# fisrt
z = outputs[:,0,:]
# z = torch.sum(outputs, dim=1)
# print(z)
# z shape: [bs, d_model]
outputs = self.lc(z)
# print(outputs)
# outputs shape: [bs, 1]
return outputs.squeeze(-1)
def adjust_learning_rate(optimizer, epoch, start_lr):
    """Decay the learning rate by a factor of 10 every 3 epochs.

    (The original docstring said "every 30 epochs", but the code uses
    ``epoch // 3`` — the step size is 3.)
    """
    decayed_lr = start_lr * (0.1 ** (epoch // 3))
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
然后是训练的部分,这里我尝试将学习率调到0.000001过,但是损失并没有得到一个很好的下降,我还试着将模型层数添加到15层,很可惜也没有用,dropout调成0就单纯想试一下损失很低的时候能低成什么样子。其实我一直也没有很懂前面那几个参数是用来干嘛的,一直都只去调了一下层数,dropout,轮数和学习率,整个代码不敢乱动,主要还是专业知识不够,没那个能力。
# 训练
def train():
## super param
N = 10 #int / int(len(dataset) * 1) # 或者你可以设置为数据集大小的一定比例,如 int(len(dataset) * 0.1)
INPUT_DIM = 292 # src length
D_MODEL = 512
NUM_HEADS = 4
FNN_DIM = 1024
NUM_LAYERS = 10
DROPOUT = 0
CLIP = 1 # CLIP value
N_EPOCHS = 50
LR = 5e-6
start_time = time.time() # 开始计时
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
data = read_data("../dataset/round1_train_data.csv")
dataset = ReactionDataset(data)
subset_indices = list(range(N))
subset_dataset = Subset(dataset, subset_indices)
train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT)
model = model.to(device)
model.train()
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
criterion = nn.MSELoss()
best_valid_loss = 10
for epoch in range(N_EPOCHS):
epoch_loss = 0
# adjust_learning_rate(optimizer, epoch, LR) # 动态调整学习率
for i, (src, y) in enumerate(train_loader):
src, y = src.to(device), y.to(device)
optimizer.zero_grad()
output = model(src)
loss = criterion(output, y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
optimizer.step()
epoch_loss += loss.detach().item()
# if i % 50 == 0:
# print(f'Step: {i} | Train Loss: {epoch_loss:.4f}')
loss_in_a_epoch = epoch_loss / len(train_loader)
scheduler.step(loss_in_a_epoch)
print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}', round(time.time()-start_time), 's', (time.time()-start_time)//60, 'minutes')
if loss_in_a_epoch < best_valid_loss:
best_valid_loss = loss_in_a_epoch
# 在训练循环结束后保存模型
torch.save(model.state_dict(), '../model/transformer.pth')
end_time = time.time() # 结束计时
# 计算并打印运行时间
elapsed_time_minute = (end_time - start_time)/60
print(f"Total running time: {elapsed_time_minute:.2f} minutes")
# Script entry point: run training only when executed directly.
if __name__ == '__main__':
    train()
# 生成结果文件
def predicit_and_make_submit_file(model_file, output_file):
INPUT_DIM = 292 # src length
D_MODEL = 512
NUM_HEADS = 4
FNN_DIM = 1024
NUM_LAYERS = 10
DROPOUT = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_data = read_data("../dataset/round1_test_data.csv", train=False)
test_dataset = ReactionDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)
model = TransformerEncoderModel(INPUT_DIM, D_MODEL, NUM_HEADS, FNN_DIM, NUM_LAYERS, DROPOUT).to(device)
# 加载最佳模型
model.load_state_dict(torch.load(model_file))
model.eval()
output_list = []
for i, (src, y) in enumerate(test_loader):
src = src.to(device)
with torch.no_grad():
output = model(src)
output_list += output.detach().tolist()
ans_str_lst = ['rxnid,Yield']
for idx,y in enumerate(output_list):
ans_str_lst.append(f'test{idx+1},{y:.4f}')
with open(output_file,'w') as fw:
fw.writelines('\n'.join(ans_str_lst))
# Fix: guard the prediction run so merely importing this module does not
# immediately execute inference and overwrite the result file.
if __name__ == '__main__':
    predicit_and_make_submit_file("../model/transformer.pth",
                                  "../output/result.txt")
学习心得
最后是我的损失值,可惜没有到一个很好的效果,后面的结果都不用看了,损失必定下不去,改了很多参数,损失一直在0.052下不去。这次的夏令营让我有点没反应过来,从最开始的随机森林,准确率可以达0.3以上,到后面rnn仅有0.2不到,再到transformer最高只有0.1,我整个人都是一个蒙的状态,明明用的预测方法越来越高级了,但效果却越来越差了,如果说rnn仅对2万条化学数据提取特征,相对于庞大的化学体系来说,2万条数据太少了的话,那好歹transformer的效果也不会比rnn的效果还差吧,我总感觉是网络结构的问题,最起码神经网络还是能从这些化学数据中学到点东西的吧,效果不至于这么差吧。我之前在task2时使用了LSTM,效果就比rnn好了一点,结果到了task3,效果更不行了,因为代码都是torch集成好了的,增加层数就可以简单增加网络深度,而且使用的是SMILES特征,数据上除了数量少之外应该不会有太大问题,我还将溶剂和添加剂加进去了,同时加深了网络层数,但是这个网络的提取特征的能力还是有点不尽人意。应该是我自己认知的局限吧,见识得少了,还得多加训练,也许是我还不清楚那些我所认为的高级货的局限性,而没能挖掘出其应发挥的价值,就是陷于庞大的知识体系也无法知其全貌。
“不识庐山真面目,只缘身在此山中”