Datawhale Summer Camp, Session 3: Machine Learning Notes 2

  • Datawhale Summer Camp, Session 3: Machine Learning Notes 2

    • Categories of machine learning tasks

      • Classification tasks

        • The model predicts discrete values

      • Regression tasks

        • The model predicts continuous values

      • Note: with some processing, discrete values can be approximated as continuous values; for example, the reaction yield predicted later in this note is treated as a continuous regression target

    • Deep learning

      • Deep learning is a subset of machine learning that learns the features and distribution of data mainly through neural networks. An important advance of deep learning is that it no longer requires laborious manual feature engineering; the neural network learns the features from the data by itself.

      • An RNN (Recurrent Neural Network) is well suited to sequential data. At each step, an RNN layer produces its own output and also passes a hidden state vector on to the next step (and, in a stacked RNN, its outputs feed the layer above); a minimal sketch of this is shown after the figure below.

      • (Figure: schematic of the RNN architecture)
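
      • A minimal sketch (not part of the competition code) of this behavior: PyTorch's nn.RNN returns both the per-step outputs and the final hidden state. The sizes below are made-up illustration values:

        import torch
        import torch.nn as nn

        # Toy bidirectional 2-layer RNN over a batch of 4 sequences of length 20
        rnn = nn.RNN(input_size=8, hidden_size=16, num_layers=2,
                     batch_first=True, bidirectional=True)
        x = torch.randn(4, 20, 8)   # [batch, seq_len, input_size]
        out, hn = rnn(x)
        print(out.shape)            # [4, 20, 32]: per-step outputs, both directions concatenated
        print(hn.shape)             # [4, 4, 16]: final hidden state, [num_layers*num_directions, batch, hidden]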

    • The full code is as follows:

    • !pip install pandas
      !pip install rdkit
      !pip install torch
      
      import re
      import time
      import pandas as pd
      from typing import List, Tuple
      import torch
      import torch.nn as nn
      import torch.optim as optim
      from torch.utils.data import Dataset, DataLoader, Subset
      
      # Define the RNN model
      class RNNModel(nn.Module):
          def __init__(self, num_embed, input_size, hidden_size, output_size, num_layers, dropout, device):
              super(RNNModel, self).__init__()
              self.embed = nn.Embedding(num_embed, input_size)
              self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, 
                                batch_first=True, dropout=dropout, bidirectional=True)
              self.fc = nn.Sequential(nn.Linear(2 * num_layers * hidden_size, output_size),
                                      nn.Sigmoid(),
                                      nn.Linear(output_size, 1),
                                      nn.Sigmoid())
      
          def forward(self, x):
              # x : [bs, seq_len]
              x = self.embed(x)
              # x : [bs, seq_len, input_size]
              _, hn = self.rnn(x) # hn : [2*num_layers, bs, h_dim]
              hn = hn.transpose(0,1)
              z = hn.reshape(hn.shape[0], -1) # z shape: [bs, 2*num_layers*h_dim]
              output = self.fc(z).squeeze(-1) # output shape: [bs]
              return output
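
      # A quick shape check for this model (sketch; the values match the hyperparameters used below):
      # with num_embed=294, input_size=300, hidden_size=512, output_size=512, num_layers=10,
      # a LongTensor of token ids of shape [bs, seq_len] gives an output of shape [bs],
      # with values in (0, 1) because of the final Sigmoid, e.g.:
      #   m = RNNModel(294, 300, 512, 512, 10, 0.2, 'cpu')
      #   m(torch.randint(0, 294, (4, 50))).shape  # torch.Size([4])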
      
      # import matplotlib.pyplot as plt
      ## Data processing
      # Tokenizer: given the syntax of SMILES, we define our own tokenizer and vocab here
      # The SMILES string is split into tokens and each token is replaced by its index in the vocabulary
      class Smiles_tokenizer():
          def __init__(self, pad_token, regex, vocab_file, max_length):
              self.pad_token = pad_token
              self.regex = regex
              self.vocab_file = vocab_file
              self.max_length = max_length
      
              with open(self.vocab_file, "r") as f:
                  lines = f.readlines()
              lines = [line.strip("\n") for line in lines]
              vocab_dic = {}
              for index, token in enumerate(lines):
                  vocab_dic[token] = index
              self.vocab_dic = vocab_dic
      
          def _regex_match(self, smiles):
              regex_string = r"(" + self.regex + r"|"
              regex_string += r".)"
              prog = re.compile(regex_string)
      
              tokenised = []
              for smi in smiles:
                  tokens = prog.findall(smi)
                  if len(tokens) > self.max_length:
                      tokens = tokens[:self.max_length]
                  tokenised.append(tokens) # list of token lists, one per SMILES string
              return tokenised
          
          def tokenize(self, smiles):
              tokens = self._regex_match(smiles)
              # Add the start token <CLS> and the end token <SEP>
              tokens = [["<CLS>"] + token + ["<SEP>"] for token in tokens]
              tokens = self._pad_seqs(tokens, self.pad_token)
              token_idx = self._pad_token_to_idx(tokens)
              return tokens, token_idx
      
          def _pad_seqs(self, seqs, pad_token):
              pad_length = max([len(seq) for seq in seqs])
              padded = [seq + ([pad_token] * (pad_length - len(seq))) for seq in seqs]
              return padded
      
          def _pad_token_to_idx(self, tokens):
              idx_list = []
              for token in tokens:
                  tokens_idx = []
                  for i in token:
                      if i in self.vocab_dic.keys():
                          tokens_idx.append(self.vocab_dic[i])
                      else:
                          self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
                          tokens_idx.append(self.vocab_dic[i])
                  idx_list.append(tokens_idx)
              
              return idx_list
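
      # Usage sketch for the tokenizer (assumes ../vocab_full.txt from the baseline exists and REGEX is
      # defined as in collate_fn below; the SMILES string is a made-up illustration):
      #   tok = Smiles_tokenizer("<PAD>", REGEX, "../vocab_full.txt", max_length=300)
      #   tokens, ids = tok.tokenize(["CCO.CC(=O)Cl>CC(=O)OCC"])
      #   # tokens[0] starts with "<CLS>" and ends with "<SEP>"; ids[0] holds the vocab indices.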
      
      # Read and preprocess the data
      def read_data(file_path, train=True):
          df = pd.read_csv(file_path)
          reactant1 = df["Reactant1"].tolist()
          reactant2 = df["Reactant2"].tolist()
          product = df["Product"].tolist()
          additive = df["Additive"].tolist()
          solvent = df["Solvent"].tolist()
          if train:
              react_yield = df["Yield"].tolist()
          else:
              react_yield = [0 for i in range(len(reactant1))]
          
          # Join the reactants with '.', then append the product after '>'
          input_data_list = []
          for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
              input_info = ".".join([react1, react2])
              input_info = ">".join([input_info, prod])
              input_data_list.append(input_info)
          output = [(react, y) for react, y in zip(input_data_list, react_yield)]
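
          # For illustration only (hypothetical molecules), one entry of `output` looks like
          #   ("CCO.CC(=O)Cl>CC(=O)OCC", 0.85)
          # i.e. "reactant1.reactant2>product" paired with the reaction yield.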
      
          # The commented-out code below instead joins reactant1, reactant2, additive and solvent with '.', then appends the product after '>'
          '''
          input_data_list = []
          for react1, react2, prod, addi, sol in zip(reactant1, reactant2, product, additive, solvent):
              input_info = ".".join([react1, react2, addi, sol])
              input_info = ">".join([input_info, prod])
              input_data_list.append(input_info)
          output = [(react, y) for react, y in zip(input_data_list, react_yield)]
          '''
      
          # # Sequence length statistics: the length distribution is a useful reference; the commented-out code below inspects it
          # seq_length = [len(i[0]) for i in output]
          # seq_length_400 = [len(i[0]) for i in output if len(i[0])>200]
          # print(len(seq_length_400) / len(seq_length))
          # seq_length.sort(reverse=True)
          # plt.plot(range(len(seq_length)), seq_length)
          # plt.title("sequence length distribution")
          # plt.show()
          return output
      
      class ReactionDataset(Dataset):
          def __init__(self, data: List[Tuple[List[str], float]]):
              self.data = data
              
          def __len__(self):
              return len(self.data)
      
          def __getitem__(self, idx):
              return self.data[idx]
          
      def collate_fn(batch):
          REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
          tokenizer = Smiles_tokenizer("<PAD>", REGEX, "../vocab_full.txt", max_length=300)
          smi_list = []
          yield_list = []
          for i in batch:
              smi_list.append(i[0])
              yield_list.append(i[1])
          tokenizer_batch = torch.tensor(tokenizer.tokenize(smi_list)[1])
          yield_list = torch.tensor(yield_list)
          return tokenizer_batch, yield_list
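
      # Sketch of what one collated batch looks like (assuming the vocab file is in place):
      #   loader = DataLoader(ReactionDataset(data), batch_size=128, shuffle=True, collate_fn=collate_fn)
      #   src, y = next(iter(loader))
      #   # src: LongTensor [128, padded_seq_len] of token ids; y: tensor [128] of yields.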
      
      
      def train():
          ## Hyperparameters
          N = 10  # size of a small debugging subset, or e.g. int(len(dataset) * 0.1) for a fraction of the dataset
          NUM_EMBED = 294 # vocabulary size for nn.Embedding
          INPUT_SIZE = 300 # embedding dimension
          HIDDEN_SIZE = 512
          OUTPUT_SIZE = 512
          NUM_LAYERS = 10
          DROPOUT = 0.2
          CLIP = 1 # gradient clipping threshold
          N_EPOCHS = 10
          LR = 0.001
          
          start_time = time.time()  # start timing
          device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          # device = 'cpu'
          data = read_data("../dataset/round1_train_data.csv")
          dataset = ReactionDataset(data)
          subset_indices = list(range(N))
          subset_dataset = Subset(dataset, subset_indices)  # small subset for quick debugging (not used below)
          train_loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)  # trains on the full dataset; swap in subset_dataset for a quick debug run
      
          model = RNNModel(NUM_EMBED, INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, DROPOUT, device).to(device)
          model.train()
          
          optimizer = optim.Adam(model.parameters(), lr=LR)
          # criterion = nn.MSELoss() # MSE
          criterion = nn.L1Loss() # MAE
      
          best_loss = 10
          for epoch in range(N_EPOCHS):
              epoch_loss = 0
              for i, (src, y) in enumerate(train_loader):
                  src, y = src.to(device), y.to(device)
                  optimizer.zero_grad()
                  output = model(src)
                  loss = criterion(output, y)
                  loss.backward()
                  torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                  optimizer.step()
                  epoch_loss += loss.item()
              loss_in_a_epoch = epoch_loss / len(train_loader)
              print(f'Epoch: {epoch+1:02} | Train Loss: {loss_in_a_epoch:.3f}')
              if loss_in_a_epoch < best_loss:
                  best_loss = loss_in_a_epoch
                  # Save the checkpoint whenever the average epoch loss improves
                  torch.save(model.state_dict(), '../model/RNN.pth')
          end_time = time.time()  # stop timing
          # Compute and print the elapsed time
          elapsed_time_minute = (end_time - start_time)/60
          print(f"Total running time: {elapsed_time_minute:.2f} minutes")
      
      if __name__ == '__main__':
          train()
      
      
      # Generate the submission file
      def predict_and_make_submit_file(model_file, output_file):
          NUM_EMBED = 294
          INPUT_SIZE = 300
          HIDDEN_SIZE = 512
          OUTPUT_SIZE = 512
          NUM_LAYERS = 10
          DROPOUT = 0.2
          device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          test_data = read_data("../dataset/round1_test_data.csv", train=False)
          test_dataset = ReactionDataset(test_data)
          test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn) 
      
          model = RNNModel(NUM_EMBED, INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, DROPOUT, device).to(device)
          # Load the best model
          model.load_state_dict(torch.load(model_file))
          model.eval()
          output_list = []
          for i, (src, y) in enumerate(test_loader):
              src, y = src.to(device), y.to(device)
              with torch.no_grad():
                  output = model(src)
                  output_list += output.detach().tolist()
          ans_str_lst = ['rxnid,Yield']
          for idx,y in enumerate(output_list):
              ans_str_lst.append(f'test{idx+1},{y:.4f}')
          with open(output_file,'w') as fw:
              fw.writelines('\n'.join(ans_str_lst))
      
          print("done!!!")
          
      predict_and_make_submit_file("../model/RNN.pth",
                                   "../output/RNN_submit.txt")

      Since no hyperparameters were tuned, the final score was -0.0330.

    • Further work: deepen the understanding of RNN principles and of PyTorch usage
