"""Course exercise: per-residue protein secondary-structure prediction (TensorFlow 2.10).

A protein's secondary structure has three classes: sheet, helix, loop.
Input:  the protein's amino-acid sequence (a string over the 20-letter alphabet)
Output: the secondary-structure class for each residue (3 classes)
Dataset: 3000 samples (available from the author, WeChat: l283293400)
"""
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
#导入所需包
# Load every pickled sample from the data directory into memory.
# NOTE(review): pickle.load is unsafe on untrusted files — these are assumed
# to be the course-provided dataset files.
folder_path = './assignment1_data'
data = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.pkl'):
        continue
    with open(os.path.join(folder_path, filename), 'rb') as file:
        data.append(pickle.load(file))
# Longest sequence / structure string across the whole dataset.
# NOTE(review): each sample presumably has equal-length 'seq' and 'ssp' — confirm.
max_length_seq = max(len(sample['seq']) for sample in data)
max_length_ssp = max(len(sample['ssp']) for sample in data)

# Amino-acid vocabulary: the 20 standard residues plus '_' (index 20) for padding.
amino_acid_dict = {aa: idx for idx, aa in enumerate('ARNDCQEGHILKMFPSTWYV_')}

# Integer-encode every sequence, then right-pad (value 20 == '_') to the max length.
encoded_seqs = [[amino_acid_dict[aa] for aa in sample['seq']] for sample in data]
sequences = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        encoded_seqs,
        maxlen=max_length_seq,
        padding='post',
        truncating='post',
        value=20,
    )
)
# Structure label vocabulary; '?' (index 3) marks padded positions.
structure_to_index = {'H': 0, 'E': 1, 'C': 2, '?': 3}
index_to_structure = {idx: s for s, idx in structure_to_index.items()}

# Integer-encode every structure string, right-padding with 3 ('?').
encoded_ssp = [[structure_to_index[aa] for aa in sample['ssp']] for sample in data]
structure = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        encoded_ssp,
        maxlen=max_length_ssp,
        padding='post',
        truncating='post',
        value=3,
    )
)
# Split into train/test sets.
# BUG FIX: the original passed the undefined name `structure_ds` (NameError);
# the integer-encoded labels live in `structure`.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, structure, test_size=0.2, random_state=42)

# Build tf.data input pipelines: training data is shuffled, repeated
# (so steps_per_epoch drives epoch length) and batched; test data is only batched.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
train_ds = train_ds.shuffle(1000).repeat().batch(64)
test_ds = test_ds.batch(64)
# Fully-convolutional encoder/decoder over the residue axis: MaxPooling1D
# halves the sequence length, Conv1DTranspose(strides=2) restores it, and the
# final 1x1 conv emits a 4-way softmax (H/E/C/?) per residue.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=21, output_dim=128, input_length=250),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.MaxPooling1D(padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Conv1DTranspose(filters=256, kernel_size=5, strides=2,
                                    activation='relu', padding='same'),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=4, kernel_size=1, activation='softmax'),
])
# Compile the network.
# BUG FIX: the labels are integer class indices of shape (batch, seq_len),
# not one-hot vectors, while the model outputs (batch, seq_len, 4) softmax
# probabilities — 'categorical_crossentropy' would fail with a shape
# mismatch. The sparse variant is the correct loss for integer labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Train and keep the History object for plotting.
# FIX: `validation_batch_size` was removed — Keras ignores it when
# validation_data is a tf.data.Dataset (test_ds is already batched at 64),
# and 600 // 64 == 9 was not a meaningful batch size anyway.
history = model.fit(
    train_ds,
    steps_per_epoch=2400 // 64,  # 3000 samples * 0.8 train split / batch 64
    epochs=300,
    validation_data=test_ds,
)
# Plot training curves.
# BUG FIX: the original drew the accuracy curves onto the same axes as the
# loss curves (no new figure in between), leaving one garbled plot with a
# stale y-label; each metric now gets its own figure.
plt.figure()
plt.plot(history.epoch, history.history.get('loss'), label='train_loss')
plt.plot(history.epoch, history.history.get('val_loss'), label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.figure()
plt.plot(history.epoch, history.history.get('accuracy'), label='train_accuracy')
plt.plot(history.epoch, history.history.get('val_accuracy'), label='val_accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()
# Inspect one test sample: print the true structure, then the prediction.
test_index = 1
sequence = X_test[test_index]
structure = y_test[test_index]
print("实际结构:")
# BUG FIX: y_test rows are integer class labels, not one-hot vectors, so the
# original np.argmax(s) on each scalar always returned 0 and printed 'H' for
# every position; map each integer label to its character directly.
for label in structure:
    char = index_to_structure.get(int(label))
    if char == '?':  # skip padded positions
        continue
    print(char, end='')
print("")
# model.predict expects a batch, so add a leading batch dimension.
sequence = np.expand_dims(sequence, 0)
pre_structure = model.predict(sequence)
# Prediction rows ARE per-class probabilities, so argmax is correct here.
for probs in pre_structure[0]:
    char = index_to_structure.get(np.argmax(probs))
    if char == '?':
        continue
    print(char, end='')
def protein_predict(sequence):
    """Print the predicted secondary structure for an amino-acid string.

    Uses the module-level ``model``, ``amino_acid_dict``, ``max_length_seq``
    and ``index_to_structure``. Sequences are right-padded (or truncated) to
    ``max_length_seq``; padded positions predicted as '?' are not printed.
    Raises KeyError if ``sequence`` contains a letter outside the 20-residue
    alphabet.
    """
    encoded = np.expand_dims([amino_acid_dict[c] for c in sequence], 0)
    padded = np.array(
        tf.keras.preprocessing.sequence.pad_sequences(
            encoded, maxlen=max_length_seq, padding='post',
            truncating='post', value=20,
        )
    )
    prediction = model.predict(padded)
    for probs in prediction[0]:
        char = index_to_structure.get(np.argmax(probs))
        if char != '?':
            print(char, end='')
    print('')

seq = 'ERDVNQLTPRERDILKLIAQGLPNKMIARRLDITESTVKVHVKHMLKKMKLKSRVEAAVWVHQERIF'
protein_predict(seq)
"""
1/1 [==============================] - 0s 21ms/step
CCCCCCCCHHHHHHHHHHHHCCCHHHHEECCCCHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHCC
真实结构:
CCCHHHCCHHHHHHHHHHHCCCCHHHHHHHHCCCHHHHHHHHHHHHHHHCCCCHHHHHHHHHHHCCC
"""