提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
前言
RNAi利用siRNA精确抑制基因表达,减少蛋白质合成,通过RISC切割mRNA实现基因沉默,对基因疗法和疾病治疗有重要意义。随着人工智能的不断发展,机器学习这门技术也越来越重要。在机器学习中,模型训练通过优化算法调整参数以提升性能,常用MSE、MAE等指标评估。
本次比赛旨在利用机器学习技术,预测化学修饰后的siRNA序列在RNA干扰(RNAi)机制下对靶基因的沉默效率。RNAi是一种重要的基因表达调控机制,通过干扰特定基因的表达,可以用于疾病治疗。这次比赛的目标是通过构建并优化模型,准确预测siRNA的沉默效率,从而提升药物设计的效率和效果。
一、跑通baseline
【baseline链接】
https://datawhaler.feishu.cn/wiki/Wl5AwNiwMibQMjkkUxXcnpxvnNW
1.魔搭GPU环境
魔搭链接:https://modelscope.cn/my/mynotebook/preset
选择GPU环境
2.下载代码数据文件
3.运行baseline
在魔搭中上传文件,进入魔搭终端,解压数据集的指令如下:
unzip siRNA_0715.zip
之后打开名为 task3.2_siRNA.ipynb 的代码文件并运行。
等待运行结束,得到结果文件 submission.csv,下载它并提交到官网,就完成了。
二、AI与
1.引入库
代码如下:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import Counter
from rich import print
from sklearn.metrics import precision_score, recall_score, mean_absolute_error
2.创建基因组分词器
class GenomicTokenizer:
    """Split a sequence string into fixed-width, upper-cased n-gram tokens.

    Args:
        ngram: Number of characters in every token.
        stride: Distance between the start positions of consecutive tokens
            (ignored when ``ngram == 1``).
    """

    def __init__(self, ngram=5, stride=2):
        self.ngram = ngram
        self.stride = stride

    def tokenize(self, t):
        """Return the list of tokens for the string ``t``.

        With ``ngram == 1`` every character becomes its own token. Otherwise a
        window of ``ngram`` characters is taken every ``stride`` characters and
        windows that run past the end of the string are discarded.

        Returns:
            A list of upper-cased tokens; an empty list when ``t`` is empty or
            shorter than ``ngram``.
        """
        t = t.upper()
        if self.ngram == 1:
            return list(t)
        # Only full-width windows are kept, so no trailing clean-up is needed.
        # The former `toks[-1]` length check could never remove anything and
        # raised IndexError whenever no token was produced.
        windows = (t[i:i + self.ngram] for i in range(0, len(t), self.stride))
        return [window for window in windows if len(window) == self.ngram]
3.创建基因组词汇表:
class GenomicVocab:
    """Two-way mapping between tokens and integer indices.

    Attributes are ``itos`` (index -> token, the sequence passed in) and
    ``stoi`` (token -> index, derived from ``itos``).
    """

    def __init__(self, itos):
        self.itos = itos
        index_of = {}
        for position, token in enumerate(self.itos):
            index_of[token] = position
        self.stoi = index_of

    @classmethod
    def create(cls, tokens, max_vocab, min_freq):
        """Build a vocabulary from an iterable of tokens.

        Index 0 is always ``'<pad>'``. The remaining entries are taken from the
        ``max_vocab - 1`` most frequent tokens, keeping only those that occur
        at least ``min_freq`` times, in descending frequency order.
        """
        counts = Counter(tokens)
        kept = []
        for token, count in counts.most_common(max_vocab - 1):
            if count >= min_freq:
                kept.append(token)
        return cls(['<pad>'] + kept)
4. siRNA数据集转换:
加载siRNA数据,并将序列数据转换为模型可以处理的格式
class SiRNADataset(Dataset):
    """Dataset that serves encoded sequence columns plus the regression target.

    Args:
        df: Table whose rows contain every column named in ``columns`` and a
            ``'mRNA_remaining_pct'`` column.
        columns: Names of the sequence columns to encode, in output order.
        vocab: Object exposing a ``stoi`` token -> index mapping.
        tokenizer: Object exposing ``tokenize(str) -> list``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, df, columns, vocab, tokenizer, max_len):
        self.df = df
        self.columns = columns
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(list of encoded sequence tensors, target tensor)`` for row ``idx``."""
        record = self.df.iloc[idx]
        encoded_columns = []
        for name in self.columns:
            encoded_columns.append(self.tokenize_and_encode(record[name]))
        label = torch.tensor(record['mRNA_remaining_pct'], dtype=torch.float)
        return encoded_columns, label

    def tokenize_and_encode(self, seq):
        """Turn one sequence string into a ``max_len``-long tensor of token ids.

        A string containing a space is treated as already tokenised and is
        split on whitespace; any other string goes through the tokenizer.
        Tokens missing from the vocabulary map to 0, the same id used for
        padding.
        """
        if ' ' in seq:
            pieces = seq.split()
        else:
            pieces = self.tokenizer.tokenize(seq)
        lookup = self.vocab.stoi
        ids = [lookup.get(piece, 0) for piece in pieces]
        # Truncate first, then right-pad with zeros up to max_len.
        ids = ids[:self.max_len]
        ids.extend([0] * (self.max_len - len(ids)))
        return torch.tensor(ids, dtype=torch.long)
5. siRNA Model
class SiRNAModel(nn.Module):
    """Regressor that encodes token-id sequences with a shared bidirectional GRU.

    Every input sequence is embedded, run through the same GRU, and reduced to
    the GRU output at the last time step. The per-sequence features are
    concatenated and mapped to one scalar per sample.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        hidden_dim: GRU hidden size per direction.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability used between GRU layers and on the
            per-sequence features.
    """

    def __init__(self, vocab_size, embed_dim=200, hidden_dim=256, n_layers=3, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers, bidirectional=True, batch_first=True, dropout=dropout)
        # Each sequence contributes hidden_dim * 2 features (two directions),
        # so an input width of hidden_dim * 4 fixes the model to two sequences.
        self.fc = nn.Linear(hidden_dim * 4, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Predict one value per sample.

        Args:
            x: Iterable of two integer tensors of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        features = []
        for seq in x:
            states, _ = self.gru(self.embedding(seq))
            # Output of the final time step stands in for the whole sequence.
            features.append(self.dropout(states[:, -1, :]))
        combined = torch.cat(features, dim=1)
        # Drop only the trailing feature axis: a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return self.fc(combined).squeeze(-1)
6. 评估指标计算函数:
该函数根据比赛官方设定的评价规则,用代码实现评分的计算:先得到整体MAE、预测值落在[0, threshold]区间内样本的MAE,以及以threshold为界做二分类后的精确率、召回率和F1值,最后将它们合成为一个评分并返回。
def calculate_metrics(y_true, y_pred, threshold=30):
    """Compute the combined score from true and predicted values.

    The score is ``0.5 * (1 - mae / 100) + 0.5 * (1 - range_mae / 100) * f1``:

    * ``mae`` is the mean absolute error over all samples.
    * ``range_mae`` is the mean absolute error over samples whose prediction
      lies in ``[0, threshold]``, or 100 when there is no such sample.
    * ``f1`` comes from the binary labels ``value < threshold``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
        threshold: Cut-off used for the binary labels and the range mask.

    Returns:
        The score as a Python float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = np.mean(np.abs(y_true - y_pred))

    in_range = (y_pred >= 0) & (y_pred <= threshold)
    if in_range.any():
        range_mae = np.mean(np.abs(y_true[in_range] - y_pred[in_range]))
    else:
        range_mae = 100

    true_hit = y_true < threshold
    pred_hit = y_pred < threshold
    true_positives = np.count_nonzero(true_hit & pred_hit)
    predicted_positives = np.count_nonzero(pred_hit)
    actual_positives = np.count_nonzero(true_hit)

    # An undefined ratio counts as 0, so a model that predicts no positives
    # gets f1 == 0 rather than a NaN score that can never be compared.
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return float(score)
7. 模型评估函数:
def evaluate_model(model, test_loader, device='cuda'):
    """Score ``model`` on a labelled data loader.

    Runs the model in eval mode without gradients over every batch, gathers
    predictions and targets, prints the score from ``calculate_metrics`` and
    returns it.

    Args:
        model: Module called with a list of input tensors per batch.
        test_loader: Iterable of ``(inputs, target)`` batches, where ``inputs``
            is a list of tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The score computed by ``calculate_metrics``.
    """
    model.eval()
    predictions = []
    targets = []
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs = [x.to(device) for x in inputs]
            outputs = model(inputs)
            # reshape(-1) keeps a batch of one iterable even when the model
            # hands back a 0-d tensor.
            predictions.extend(outputs.reshape(-1).cpu().numpy())
            targets.extend(target.reshape(-1).numpy())
    y_pred = np.array(predictions)
    y_test = np.array(targets)
    score = calculate_metrics(y_test, y_pred)
    print(f"Test Score: {score:.4f}")
    # Returned as well as printed so callers can use the value.
    return score
8. 模型训练函数:
用于训练模型:每个epoch结束后在验证集上评估模型的性能,记录得分最高的模型参数,并在训练结束后将其返回。
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50, device='cuda'):
    """Train ``model`` and return the weights of the best-scoring epoch.

    After every epoch the model is evaluated on ``val_loader``; the epoch whose
    ``calculate_metrics`` score is highest has its parameters snapshotted.

    Args:
        model: Module called with a list of input tensors per batch.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the model and batches are moved to.

    Returns:
        A state dict holding the best epoch's parameters, or ``None`` when no
        epoch produced a score above negative infinity.
    """
    import copy

    model.to(device)
    best_score = -float('inf')
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for inputs, targets in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            inputs = [x.to(device) for x in inputs]
            targets = targets.to(device)
            optimizer.zero_grad()
            # reshape(-1) keeps outputs 1-D even for a batch of one, so the
            # loss sees matching shapes.
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = [x.to(device) for x in inputs]
                targets = targets.to(device)
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
                val_targets.extend(targets.reshape(-1).cpu().numpy())

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_preds = np.array(val_preds)
        val_targets = np.array(val_targets)
        score = calculate_metrics(val_targets, val_preds)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')
        print(f'Validation Score: {score:.4f}')

        if score > best_score:
            best_score = score
            # Deep copy: state_dict() values are the live parameter tensors,
            # so a shallow .copy() would keep changing with later epochs.
            best_model = copy.deepcopy(model.state_dict())
            print(f'New best model found with score: {best_score:.4f}')

    return best_model
9. 训练主程序:
主程序依次完成:读取训练数据并划分训练集和验证集、构建分词器和词汇表、创建数据集与数据加载器、初始化模型,最后调用训练函数优化模型的参数,使模型能够学习数据的特征并做出准确的预测。
if __name__ == '__main__':
    # Training entry point: load data, build the vocabulary, train the model.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load data and drop rows missing either sequence column or the target
    train_data = pd.read_csv('train_data.csv')
    columns = ['siRNA_antisense_seq', 'modified_siRNA_antisense_seq_list']
    train_data.dropna(subset=columns + ['mRNA_remaining_pct'], inplace=True)
    train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

    # Create vocabulary and find the max sequence length in a single pass over
    # the training split, so both use exactly the same tokenisation rule.
    tokenizer = GenomicTokenizer(ngram=3, stride=3)
    all_tokens = []
    max_len = 0
    for col in columns:
        for seq in train_data[col]:
            # A space marks a modified sequence that is already tokenised.
            seq_tokens = seq.split() if ' ' in seq else tokenizer.tokenize(seq)
            all_tokens.extend(seq_tokens)
            max_len = max(max_len, len(seq_tokens))
    vocab = GenomicVocab.create(all_tokens, max_vocab=10000, min_freq=1)

    # Create datasets
    train_dataset = SiRNADataset(train_data, columns, vocab, tokenizer, max_len)
    val_dataset = SiRNADataset(val_data, columns, vocab, tokenizer, max_len)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    # Initialize model
    model = SiRNAModel(len(vocab.itos))
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Keep the returned weights instead of discarding them, and load them so
    # that `model` holds what train_model reported as best.
    best_state = train_model(model, train_loader, val_loader, criterion, optimizer, 50, device)
    if best_state is not None:
        model.load_state_dict(best_state)
10. 测试程序:
用于评估训练好的模型或软件在实际使用中的性能和表现。
总结
从零入门 AI for Science(AI+药物) 是 Datawhale 2024 年 AI 夏令营第三期的学习活动(“AI+药物”方向),基于天池平台“第二届世界科学智能大赛 生命科学赛道:siRNA药物药效预测”开展的实践学习。
欢迎你的加入!