Datawhale AI夏令营 AI+化学 task2

本次任务:RNN建模SMILES进行反应产率预测

引入库

定义模型

定义数据处理函数以及tokenizer

# import matplotlib.pyplot as plt
## 数据处理部分
# tokenizer,鉴于SMILES的特性,这里需要自己定义tokenizer和vocab
# 这里直接将smiles str按字符拆分,并替换为词汇表中的序号
class Smiles_tokenizer():
    def __init__(self, pad_token, regex, vocab_file, max_length):
        self.pad_token = pad_token
        self.regex = regex
        self.vocab_file = vocab_file
        self.max_length = max_length

        with open(self.vocab_file, "r") as f:
            lines = f.readlines()
        lines = [line.strip("\n") for line in lines]
        vocab_dic = {}
        for index, token in enumerate(lines):
            vocab_dic[token] = index
        self.vocab_dic = vocab_dic

    def _regex_match(self, smiles):
        regex_string = r"(" + self.regex + r"|"
        regex_string += r".)"
        prog = re.compile(regex_string)

        tokenised = []
        for smi in smiles:
            tokens = prog.findall(smi)
            if len(tokens) > self.max_length:
                tokens = tokens[:self.max_length]
            tokenised.append(tokens) # 返回一个所有的字符串列表
        return tokenised
    
    def tokenize(self, smiles):
        tokens = self._regex_match(smiles)
        # 添加上表示开始和结束的token:<cls>, <end>
        tokens = [["<CLS>"] + token + ["<SEP>"] for token in tokens]
        tokens = self._pad_seqs(tokens, self.pad_token)
        token_idx = self._pad_token_to_idx(tokens)
        return tokens, token_idx

    def _pad_seqs(self, seqs, pad_token):
        pad_length = max([len(seq) for seq in seqs])
        padded = [seq + ([pad_token] * (pad_length - len(seq))) for seq in seqs]
        return padded

    def _pad_token_to_idx(self, tokens):
        idx_list = []
        for token in tokens:
            tokens_idx = []
            for i in token:
                if i in self.vocab_dic.keys():
                    tokens_idx.append(self.vocab_dic[i])
                else:
                    self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
                    tokens_idx.append(self.vocab_dic[i])
            idx_list.append(tokens_idx)
        
        return idx_list

# 读数据并处理
def read_data(file_path, train=True):
    df = pd.read_csv(file_path)
    reactant1 = df["Reactant1"].tolist()
    reactant2 = df["Reactant2"].tolist()
    product = df["Product"].tolist()
    additive = df["Additive"].tolist()
    solvent = df["Solvent"].tolist()
    if train:
        react_yield = df["Yield"].tolist()
    else:
        react_yield = [0 for i in range(len(reactant1))]
    
    # 将reactant拼到一起,之间用.分开。product也拼到一起,用>分开
    input_data_list = []
    for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
        input_info = ".".join([react1, react2])
        input_info = ">".join([input_info, prod])
        input_data_list.append(input_info)
    output = [(react, y) for react, y in zip(input_data_list, react_yield)]

    # 下面的代码将reactant\additive\solvent拼到一起,之间用.分开。product也拼到一起,用>分开
    '''
    input_data_list = []
    for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
        input_info = ".".join([react1, react2, addi, sol])
        input_info = ">".join([input_info, prod])
        input_data_list.append(input_info)
    output = [(react, y) for react, y in zip(input_data_list, react_yield)]
    '''

    # # 统计seq length,序列的长度是一个重要的参考,可以使用下面的代码统计查看以下序列长度的分布
    # seq_length = [len(i[0]) for i in output]
    # seq_length_400 = [len(i[0]) for i in output if len(i[0])>200]
    # print(len(seq_length_400) / len(seq_length))
    # seq_length.sort(reverse=True)
    # plt.plot(range(len(seq_length)), seq_length)
    # plt.title("templates frequence")
    # plt.show()
    return output

class ReactionDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], float]]):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
def collate_fn(batch):
    REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
    tokenizer = Smiles_tokenizer("<PAD>", REGEX, "../vocab_full.txt", max_length=300)
    smi_list = []
    yield_list = []
    for i in batch:
        smi_list.append(i[0])
        yield_list.append(i[1])
    tokenizer_batch = torch.tensor(tokenizer.tokenize(smi_list)[1])
    yield_list = torch.tensor(yield_list)
    return tokenizer_batch, yield_list

-------------------------------------------------------------------------------------------------------------------------

def train():
    ## super param
    N = 10  #int / int(len(dataset) * 1)  # 或者你可以设置为数据集大小的一定比例,如 int(len(dataset) * 0.1)
    NUM_EMBED = 294 # nn.Embedding()
    INPUT_SIZE = 300 # src length
    HIDDEN_SIZE = 512
    OUTPUT_SIZE = 512
    NUM_LAYERS = 10
    DROPOUT = 0.2
    CLIP = 1 # CLIP value
    N_EPOCHS = 100
    LR = 0.0001
    
    start_time = time.time()  # 开始计时
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = 'cpu'
    data = read_data("../dataset/round1_train_data.csv")
    dataset = ReactionDataset(data)
    subset_indices = list(range(N))
    subset_dataset = Subset(dataset, subset_indices)
    train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)

    model = RNNModel(NUM_EMBED, INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, DROPOUT, device).to(device)
    model.train()
    
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # criterion = nn.MSELoss() # MSE
    criterion = nn.L1Loss() # MAE

    best_loss = 10
    for epoch in range(N_EPOCHS):
        epoch_loss = 0
        for i, (src, y) in enumerate(train_loader):
            src, y = src.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            epoch_loss += loss.item()
            loss_in_a_epoch = epoch_loss / len(train_loader)
        print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}')
        if loss_in_a_epoch < best_loss:
            # 在训练循环结束后保存模型
            torch.save(model.state_dict(), '../model/RNN.pth')
    end_time = time.time()  # 结束计时
    # 计算并打印运行时间
    elapsed_time_minute = (end_time - start_time)/60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")

if __name__ == '__main__':
    train()

得到结果文件

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值