"""Course exercise: per-residue protein secondary-structure prediction (TensorFlow 2.10).

A protein's secondary structure has three classes: sheet, helix, loop.
Input:  the protein's amino-acid sequence (a string over the 20-letter alphabet)
Output: the secondary-structure class for each residue (3 classes)
Dataset: 3000 samples (available from the author, WeChat: l283293400)
"""
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
#导入所需包
# Load every pickled sample from the data directory into memory.
# NOTE(review): pickle.load is unsafe on untrusted files — these are assumed
# to be the course-provided dataset files.
folder_path = './assignment1_data'
data = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.pkl'):
        continue
    with open(os.path.join(folder_path, filename), 'rb') as file:
        data.append(pickle.load(file))
# Longest sequence / structure string across the whole dataset.
# NOTE(review): each sample presumably has equal-length 'seq' and 'ssp' — confirm.
max_length_seq = max(len(sample['seq']) for sample in data)
max_length_ssp = max(len(sample['ssp']) for sample in data)

# Amino-acid vocabulary: the 20 standard residues plus '_' (index 20) for padding.
amino_acid_dict = {aa: idx for idx, aa in enumerate('ARNDCQEGHILKMFPSTWYV_')}

# Integer-encode every sequence, then right-pad (value 20 == '_') to the max length.
encoded_seqs = [[amino_acid_dict[aa] for aa in sample['seq']] for sample in data]
sequences = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        encoded_seqs,
        maxlen=max_length_seq,
        padding='post',
        truncating='post',
        value=20,
    )
)
# Structure label vocabulary; '?' (index 3) marks padded positions.
structure_to_index = {'H': 0, 'E': 1, 'C': 2, '?': 3}
index_to_structure = {idx: s for s, idx in structure_to_index.items()}

# Integer-encode every structure string, right-padding with 3 ('?').
encoded_ssp = [[structure_to_index[aa] for aa in sample['ssp']] for sample in data]
structure = np.array(
    tf.keras.preprocessing.sequence.pad_sequences(
        encoded_ssp,
        maxlen=max_length_ssp,
        padding='post',
        truncating='post',
        value=3,
    )
)
# Split into train/test sets.
# BUG FIX: the original passed the undefined name `structure_ds` (NameError);
# the integer-encoded labels live in `structure`.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, structure, test_size=0.2, random_state=42)

# Build tf.data input pipelines: training data is shuffled, repeated
# (so steps_per_epoch drives epoch length) and batched; test data is only batched.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
train_ds = train_ds.shuffle(1000).repeat().batch(64)
test_ds = test_ds.batch(64)
# Fully-convolutional encoder/decoder over the residue axis: MaxPooling1D
# halves the sequence length, Conv1DTranspose(strides=2) restores it, and the
# final 1x1 conv emits a 4-way softmax (H/E/C/?) per residue.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=21, output_dim=128, input_length=250),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.MaxPooling1D(padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Conv1DTranspose(filters=256, kernel_size=5, strides=2,
                                    activation='relu', padding='same'),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=4, kernel_size=1, activation='softmax'),
])
# Compile the network.
# BUG FIX: the labels are integer class indices of shape (batch, seq_len),
# not one-hot vectors, while the model outputs (batch, seq_len, 4) softmax
# probabilities — 'categorical_crossentropy' would fail with a shape
# mismatch. The sparse variant is the correct loss for integer labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Train and keep the History object for plotting.
# FIX: `validation_batch_size` was removed — Keras ignores it when
# validation_data is a tf.data.Dataset (test_ds is already batched at 64),
# and 600 // 64 == 9 was not a meaningful batch size anyway.
history = model.fit(
    train_ds,
    steps_per_epoch=2400 // 64,  # 3000 samples * 0.8 train split / batch 64
    epochs=300,
    validation_data=test_ds,
)
# Plot training curves.
# BUG FIX: the original drew the accuracy curves onto the same axes as the
# loss curves (no new figure in between), leaving one garbled plot with a
# stale y-label; each metric now gets its own figure.
plt.figure()
plt.plot(history.epoch, history.history.get('loss'), label='train_loss')
plt.plot(history.epoch, history.history.get('val_loss'), label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.figure()
plt.plot(history.epoch, history.history.get('accuracy'), label='train_accuracy')
plt.plot(history.epoch, history.history.get('val_accuracy'), label='val_accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()
# Inspect one test sample: print the true structure, then the prediction.
test_index = 1
sequence = X_test[test_index]
structure = y_test[test_index]
print("实际结构:")
# BUG FIX: y_test rows are integer class labels, not one-hot vectors, so the
# original np.argmax(s) on each scalar always returned 0 and printed 'H' for
# every position; map each integer label to its character directly.
for label in structure:
    char = index_to_structure.get(int(label))
    if char == '?':  # skip padded positions
        continue
    print(char, end='')
print("")
# model.predict expects a batch, so add a leading batch dimension.
sequence = np.expand_dims(sequence, 0)
pre_structure = model.predict(sequence)
# Prediction rows ARE per-class probabilities, so argmax is correct here.
for probs in pre_structure[0]:
    char = index_to_structure.get(np.argmax(probs))
    if char == '?':
        continue
    print(char, end='')
def protein_predict(sequence):
    """Print the predicted secondary structure for an amino-acid string.

    Uses the module-level ``model``, ``amino_acid_dict``, ``max_length_seq``
    and ``index_to_structure``. Sequences are right-padded (or truncated) to
    ``max_length_seq``; padded positions predicted as '?' are not printed.
    Raises KeyError if ``sequence`` contains a letter outside the 20-residue
    alphabet.
    """
    encoded = np.expand_dims([amino_acid_dict[c] for c in sequence], 0)
    padded = np.array(
        tf.keras.preprocessing.sequence.pad_sequences(
            encoded, maxlen=max_length_seq, padding='post',
            truncating='post', value=20,
        )
    )
    prediction = model.predict(padded)
    for probs in prediction[0]:
        char = index_to_structure.get(np.argmax(probs))
        if char != '?':
            print(char, end='')
    print('')

seq = 'ERDVNQLTPRERDILKLIAQGLPNKMIARRLDITESTVKVHVKHMLKKMKLKSRVEAAVWVHQERIF'
protein_predict(seq)
"""
1/1 [==============================] - 0s 21ms/step
CCCCCCCCHHHHHHHHHHHHCCCHHHHEECCCCHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHCC
真实结构:
CCCHHHCCHHHHHHHHHHHCCCCHHHHHHHHCCCHHHHHHHHHHHHHHHCCCCHHHHHHHHHHHCCC
"""