Task 3
This time I used a Transformer model. Although, after my tuning, its overall performance on this problem did not match the random forest, below are some of my takeaways.
The biggest improvement in this code is the updated vocabulary:
# Tokenizer: given the nature of SMILES strings, we need to define our own tokenizer and vocab.
# Each SMILES string is split into tokens, which are then mapped to their indices in the vocabulary.
import re

class Smiles_tokenizer():
    def __init__(self, pad_token, regex, vocab_file, max_length):
        self.pad_token = pad_token
        self.regex = regex
        self.vocab_file = vocab_file
        self.max_length = max_length

        # load the vocabulary file, one token per line, into a token -> index dict
        with open(self.vocab_file, "r") as f:
            lines = f.readlines()
        lines = [line.strip("\n") for line in lines]
        vocab_dic = {}
        for index, token in enumerate(lines):
            vocab_dic[token] = index
        self.vocab_dic = vocab_dic

    def _regex_match(self, smiles):
        regex_string = r"(" + self.regex + r"|"
        regex_string += r".)"
        prog = re.compile(regex_string)

        tokenised = []
        for smi in smiles:
            tokens = prog.findall(smi)
            if len(tokens) > self.max_length:
                tokens = tokens[:self.max_length]
            tokenised.append(tokens)  # one token list per SMILES string
        return tokenised

    def tokenize(self, smiles):
        tokens = self._regex_match(smiles)
        # add the start and end tokens: <CLS>, <SEP>
        tokens = [["<CLS>"] + token + ["<SEP>"] for token in tokens]
        tokens = self._pad_seqs(tokens, self.pad_token)
        token_idx = self._pad_token_to_idx(tokens)
        return tokens, token_idx

    def _pad_seqs(self, seqs, pad_token):
        pad_length = max([len(seq) for seq in seqs])
        padded = [seq + ([pad_token] * (pad_length - len(seq))) for seq in seqs]
        return padded

    def _pad_token_to_idx(self, tokens):
        idx_list = []
        new_vocab = []
        for token in tokens:
            tokens_idx = []
            for i in token:
                if i in self.vocab_dic.keys():
                    tokens_idx.append(self.vocab_dic[i])
                else:
                    # unseen token: grow the vocabulary on the fly
                    new_vocab.append(i)
                    self.vocab_dic[i] = max(self.vocab_dic.values()) + 1
                    tokens_idx.append(self.vocab_dic[i])
            idx_list.append(tokens_idx)

        # log any newly discovered tokens
        with open("../new_vocab_list.txt", "a") as f:
            for i in new_vocab:
                f.write(i)
                f.write("\n")

        return idx_list

    def _save_vocab(self, vocab_path):
        with open(vocab_path, "w") as f:
            for i in self.vocab_dic.keys():
                f.write(i)
                f.write("\n")
        print("vocab updated!")
This gives the model a better vocabulary to train on.
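For reference, here is a minimal usage sketch. The regex, vocab file path, pad token, and max length below are illustrative placeholders, not the exact values from my run:

# Minimal usage sketch -- the regex, vocab path, pad token, and max_length are
# illustrative placeholders, not necessarily the values used in the actual run.
REGEX = r"Cl|Br|Si|@@|[a-zA-Z]|\d|\(|\)|\[|\]|=|#|\+|-|%|\."
tokenizer = Smiles_tokenizer(pad_token="<PAD>", regex=REGEX,
                             vocab_file="../vocab.txt", max_length=128)
tokens, token_idx = tokenizer.tokenize(["CCO", "c1ccccc1"])
print(tokens[0])     # e.g. ['<CLS>', 'C', 'C', 'O', '<SEP>', ...]
print(token_idx[0])  # the corresponding indices into tokenizer.vocab_dic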
Next, the RNN was replaced with a Transformer model:
# Model
'''
A single Transformer encoder model is sufficient here.
'''
import torch
import torch.nn as nn

class TransformerEncoderModel(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, fnn_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.layerNorm = nn.LayerNorm(d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                        nhead=num_heads,
                                                        dim_feedforward=fnn_dim,
                                                        dropout=dropout,
                                                        batch_first=True,
                                                        norm_first=True  # pre-layernorm
                                                        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer,
                                                         num_layers=num_layers,
                                                         norm=self.layerNorm)
        self.dropout = nn.Dropout(dropout)
        self.lc = nn.Sequential(nn.Linear(d_model, 256),
                                nn.Sigmoid(),
                                nn.Linear(256, 96),
                                nn.Sigmoid(),
                                nn.Linear(96, 1))

    def forward(self, src):
        # src shape: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        # embedded shape: [batch_size, src_len, d_model]
        outputs = self.transformer_encoder(embedded)
        # outputs shape: [batch_size, src_len, d_model]
        # use the first token (<CLS>) as the sequence representation
        z = outputs[:, 0, :]
        # z = torch.sum(outputs, dim=1)  # alternative: sum-pool over the sequence
        # z shape: [batch_size, d_model]
        outputs = self.lc(z)
        # outputs shape: [batch_size, 1]
        return outputs.squeeze(-1)
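A quick shape sanity check, continuing from the tokenizer sketch above (the hyperparameters here are illustrative, not tuned values):

# Quick shape check -- the hyperparameters are illustrative, not tuned values.
model = TransformerEncoderModel(input_dim=len(tokenizer.vocab_dic),  # vocab size (it may have grown)
                                d_model=128, num_heads=4, fnn_dim=256,
                                num_layers=3, dropout=0.1)
src = torch.tensor(token_idx)  # [batch_size, src_len], from the tokenizer sketch above
preds = model(src)             # [batch_size]
print(preds.shape)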
To train more effectively, I set N to 0.4 times the dataset length and then increased it stepwise.
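My reading is that N is the number of training samples used per stage; under that assumption, the stepwise schedule could look like the sketch below (train_dataset and train_epoch are hypothetical placeholders, and the 1.5x growth factor is my own choice):

# Hypothetical sketch of a stepwise schedule, assuming N is the number of training
# samples used per stage; train_dataset and train_epoch are placeholder names.
total = len(train_dataset)
N = int(0.4 * total)              # start with 0.4x the dataset length
while True:
    subset = torch.utils.data.Subset(train_dataset, range(N))
    loader = torch.utils.data.DataLoader(subset, batch_size=64, shuffle=True)
    train_epoch(model, loader)    # placeholder for one stage of training
    if N >= total:
        break
    N = min(int(N * 1.5), total)  # grow the training subset each stage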
As for the definition-order issues these changes introduce (names being used before they are defined), a small reordering of the code is enough to fix them.