利用深度学习对蛋白质二级结构三态预测
通过输入蛋白质的氨基酸序列,预测其二级结构。本文先用 100 多万条 pseudo label 数据进行训练,得到一个 pre-train 模型;再用 3 万多条 labeled 数据进行 fine-tuning,得到最终用于预测的模型。
原创不易,共同学习。请诸位大神在转载时标明出处。
数据集示例
>1UCSA
NKASVVANQLIPINTALTLIMMKAEVVTPMGIPAEEIPKLVGMQVNRAVPLGTTLMPDMVKNYE
CCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEECCCECCCCECCHHHECCCC
依次对应为:
seqs = {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6, 'G': 7, 'H': 8,
'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14, 'S': 15, 'T': 16,
'W': 17, 'Y': 18, 'V': 19}
label = {'C': 0, 'H': 1, 'E': 2}
生成的对应矩阵为:
2 11 0 15 19 19 0 2 5 10 9 14 9 2 16 0 10 16 10 9 12 12 11 0 6 19 19 16 14 12 7 9 14 0 6 6 9 14 11 10 19 7 12 5 19 2 1 0 19 14 10 7 16 16 10 12 14 3 12 19 11 2 18 6
0 0 0 2 2 2 2 0 0 0 2 0 0 0 0 2 0 0 1 1 1 2 2 2 2 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 0 0 0 2 2 0 0 0 2 0 0 0 0 2 0 0 1 1 1 2 0 0 0 0
将矩阵作为输入,main代码如下:
import pdb
import sys
import os
from arg import getArgparse
# os.environ["CUDA_VISIBLE_DEVICES"] = '1'
import torch
from torch import nn
from network import S4PRED
from get_dataset import loadfasta
from torch.utils.data import DataLoader
import torch.optim as optim
import datetime
from sklearn.metrics import f1_score, precision_score, recall_score
# --- Path setup -------------------------------------------------------------
# Make the project root importable so sibling modules (arg, network,
# get_dataset) resolve regardless of the working directory.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

start = datetime.datetime.now()  # wall-clock start, for overall run timing

# --- Hyper-parameters from the CLI (see arg.getArgparse) --------------------
args_dict = getArgparse()
device = torch.device(args_dict['device'])
learn_rate = args_dict['learn_rate']
pre_train_epochs = args_dict['pre_train_epochs']
fine_tuning_epochs = args_dict['fine_tuning_epochs']
fine_tuning_batch_size = args_dict['fine_tuning_batch_size']
pre_train_batch_size = args_dict['pre_train_batch_size']
save_dpath = args_dict['save_path']
# test_flag = args_dict['test_flag']

# Ensure the checkpoint directory exists.  makedirs(exist_ok=True) is
# race-free and also creates missing intermediate directories, unlike the
# exists()-then-mkdir pair it replaces.
os.makedirs(save_dpath, exist_ok=True)

# Labels are 0/1/2 for C/H/E (three-state secondary structure); index 3 is
# the padding label and must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=3)
model = S4PRED().to(device)
optimizer = optim.Adam(model.parameters(), lr=learn_rate, betas=(0.9, 0.999))
# Decay the learning rate by 10x at epochs 50 and 80.
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)

# Best-so-far accuracy trackers for the two training phases; kept as
# single-element lists so the train/valid helpers can mutate them in place.
pretrain_accuracy = [0.0]
fine_tuning_accuracy = [0.0]
def main():
test_loader = DataLoader(loadfasta("test"), batch_size=1, shuffle=False, num_workers=8)
final_model_path = os.path.join(save_dpath, 'fine-tuning-train-best.pkl')
pre_model_path = os.path.join(save_dpath, 'pre-train-best.pkl')
if os.path.exists(final_model_path):
print('Starting test...')
test(model, test_loader)
return
train_loader = DataLoader(loadfasta("train"), batch_size=fine_tuning_batch_size, shuffle=True, num_workers=16)
valid_loader = DataLoader(loadfasta("valid"), batch_size=1, shuffle=False, num_workers=8)
if os.path.exists(pre_model_path):
print('Load the pre-trained model...')
model.load_state_dict(torch.load(os.path.join(save_dpath, 'pre-train-best.pkl')))
print('Starting fine-tuning...')
for epoch in range(fine_tuning_epochs):
print('##Fine-Tuning Epoch-%s' % epoch)
train(model, train_loader)
valid(model, valid_loader, epoch, False)
test(model, test_loader)
else:
print(