Preface:
1. This is a bioinformatics undergraduate thesis project: a protein family identification model built on TensorFlow.
2. The code is in its final form. To run it, download Pfam-A.fasta.gz from https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/, decompress it to Pfam-A.fasta, create a data folder in the directory containing the code, and place the file inside it (see the setup sketch after this list).
3. A detailed description of the model will be published after graduation; it cannot be released yet because it is part of my thesis.
4. The complete code can be downloaded from wh650/tensorflow-to-Protein-family-identification (github.com).
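A minimal setup sketch for point 2, done entirely from Python rather than a shell. The URL and file names come from the preface; everything else (paths, the decision to download with urllib) is just one possible way to prepare the data folder.

# Hedged setup sketch: download and decompress Pfam-A.fasta.gz into a local ./data folder.
# The release URL may change between Pfam releases; adjust it to the current release.
import gzip
import os
import shutil
import urllib.request

URL = 'https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.fasta.gz'

os.makedirs('data', exist_ok=True)
archive_path = os.path.join('data', 'Pfam-A.fasta.gz')
fasta_path = os.path.join('data', 'Pfam-A.fasta')

# Download the compressed FASTA file (several GB; this can take a while).
urllib.request.urlretrieve(URL, archive_path)

# Decompress it next to the archive so the scripts below can find ./data/Pfam-A.fasta.
with gzip.open(archive_path, 'rb') as src, open(fasta_path, 'wb') as dst:
    shutil.copyfileobj(src, dst)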
Code:
1. first.py
from Bio import SeqIO
from collections import defaultdict
import os

os.chdir('./data')

# Path to the Pfam-A.fasta file
file_path = 'Pfam-A.fasta'
# Per-family record counter
family_counter = defaultdict(int)
selected_records = []
current_family = ""  # family currently being processed

# Open the fasta file and iterate over its records
with open(file_path, 'r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # The third field of the description is the family ID, e.g. "PF10417.13"
        family_id = record.description.split()[2].split(';')[0]
        # Update the family currently being processed
        if family_id != current_family:
            current_family = family_id
        # If this family has fewer than 400 selected records so far, keep the record
        if family_counter[family_id] < 400:
            selected_records.append(record)
            family_counter[family_id] += 1
            # Report progress once a family reaches 400 records
            if family_counter[family_id] == 400:
                print(f"Finished extracting 400 proteins for family #{len(family_counter)}, family ID: {family_id}")
        # Stop once 5000 families have been selected and the current family also has 400 records
        if len(family_counter) == 5000 and family_counter[family_id] == 400:
            break

# selected_records now holds the first 400 protein sequences of each of the first 5000 families
# Write them to a new fasta file
output_file_path = 'selected_proteins5000_400.fasta'
with open(output_file_path, 'w') as output_file:
    SeqIO.write(selected_records, output_file, 'fasta')
print('New fasta file created with the first 400 proteins of the first 5000 families.')
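An optional sanity check, not part of the original scripts: it recounts sequences per family in the generated fasta (using the same description layout assumed above) to confirm that no family exceeds 400 sequences.

# Optional sanity check: recount sequences per family in the output fasta.
from collections import Counter
from Bio import SeqIO

counts = Counter(
    rec.description.split()[2].split(';')[0]
    for rec in SeqIO.parse('selected_proteins5000_400.fasta', 'fasta')
)
print(f'{len(counts)} families, max sequences per family: {max(counts.values())}')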
2. csv1.py
import csv
import os

os.chdir('./data')

def process_fasta(fasta_file, csv_file):
    with open(fasta_file, 'r') as file:
        with open(csv_file, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['family_label', 'protein_sequence'])
            protein_tag = ''
            protein_seq = ''
            for line in file:
                if line.startswith('>'):
                    if protein_seq:  # a new header means the previous protein is complete, so write it out
                        csvwriter.writerow([protein_tag, protein_seq])
                        protein_seq = ''  # reset the sequence buffer
                    # Parse the family tag from the header line
                    parts = line.split()
                    for part in parts:
                        if part.startswith('PF'):
                            protein_tag = part.split(';')[0]
                            break
                else:
                    protein_seq += line.strip()
            # Write the last protein in the file
            if protein_tag and protein_seq:
                csvwriter.writerow([protein_tag, protein_seq])

# Process each fasta file
process_fasta('selected_proteins5000_400.fasta', 'selected_proteins5000_400.csv')
# process_fasta('selected_proteins50000_301_to_303.fasta', 'selected_proteins50000_301_to_303.csv')
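An equivalent sketch of the same conversion using Biopython's SeqIO parser instead of manual line handling. This is not the author's method, just an alternative that assumes the same header layout as first.py (family accession in the third description field).

# Equivalent fasta-to-CSV conversion using Bio.SeqIO.
import csv
from Bio import SeqIO

def process_fasta_seqio(fasta_file, csv_file):
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['family_label', 'protein_sequence'])
        for record in SeqIO.parse(fasta_file, 'fasta'):
            family = record.description.split()[2].split(';')[0]
            writer.writerow([family, str(record.seq)])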
3. csv2.py
import os
import pandas as pd

os.chdir('./data')

# Read the original file
df = pd.read_csv('selected_proteins5000_400.csv')
# Shuffle the rows uniformly
df_shuffled = df.sample(frac=1).reset_index(drop=True)
# Write the shuffled data to a new file
df_shuffled.to_csv('selected_proteins5000_400stuff.csv', index=False)
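If reproducible shuffles matter for the thesis experiments, a fixed seed can be passed to sample. A one-line hedged variant (the seed value 42 is arbitrary):

# Reproducible shuffle: a fixed seed makes every run produce the same row order.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)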
4. Dropout.py
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

os.chdir('./data')

# Read the data in chunks
batch_size = 5000
data_chunks = pd.read_csv('selected_proteins5000_400stuff.csv', chunksize=batch_size)

# Preprocessing
max_seq_length = 48
tokenizer = Tokenizer(char_level=True)
family_labels = []
sequences = []
for chunk in data_chunks:
    for index, row in chunk.iterrows():
        family_labels.append(row['family_label'])
        seq = row['protein_sequence'][:max_seq_length]
        seq = seq.ljust(max_seq_length, 'X')  # pad with 'X' up to max_seq_length characters
        sequences.append(seq)
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)

# Encode family labels as integers
label_to_index = {label: index for index, label in enumerate(np.unique(family_labels))}
index_to_label = {index: label for label, index in label_to_index.items()}
labels = [label_to_index[label] for label in family_labels]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Convert the data to tf.Tensor
X_train_tf = tf.constant(X_train)
X_val_tf = tf.constant(X_val)
X_test_tf = tf.constant(X_test)
y_train_tf = tf.constant(y_train)
y_val_tf = tf.constant(y_val)
y_test_tf = tf.constant(y_test)

# Dropout values to compare
dropout_values = [0.6]

# Make sure the output folder for the plots exists
os.makedirs('./picture', exist_ok=True)

# Record the Loss and Accuracy of each training run
with open('training_logs.txt', 'w') as f:
    for dropout_value in dropout_values:
        # Build the model
        model = tf.keras.models.Sequential([
            tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_seq_length),
            tf.keras.layers.LSTM(128, return_sequences=True),
            tf.keras.layers.LSTM(64),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(dropout_value),  # dropout rate under test
            tf.keras.layers.Dense(len(label_to_index), activation='softmax')
        ])
        model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])
        # Train the model
        history = model.fit(X_train_tf, y_train_tf, validation_data=(X_val_tf, y_val_tf), epochs=10, batch_size=128)
        # Log the Loss and Accuracy values
        f.write(f"Dropout: {dropout_value}\n")
        f.write(f"Training Loss: {history.history['loss']}\n")
        f.write(f"Validation Loss: {history.history['val_loss']}\n")
        f.write(f"Training Accuracy: {history.history['accuracy']}\n")
        f.write(f"Validation Accuracy: {history.history['val_accuracy']}\n\n")
        # Plot the Loss curves
        plt.figure()
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title(f'Dropout = {dropout_value}')
        plt.legend()
        plt.savefig(f'./picture/loss_plot_dropout_{dropout_value}.png')
        # Plot the Accuracy curves
        plt.figure()
        plt.plot(history.history['accuracy'], label='Training Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.title(f'Dropout = {dropout_value}')
        plt.legend()
        plt.savefig(f'./picture/accuracy_plot_dropout_{dropout_value}.png')

# Choose the best Dropout value
# Pick the best-performing model from the generated curves, judging by Validation Loss and Validation Accuracy.
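A small follow-up sketch for the selection step described in the last comment. It assumes the Keras History objects of the runs have been collected in a dict named histories keyed by dropout value; that dict is illustrative and not part of the original script.

# Illustrative selection rule: keep the dropout value whose final validation loss is lowest.
# `histories` is assumed to map each dropout value to its Keras History object.
best_dropout = min(histories, key=lambda d: histories[d].history['val_loss'][-1])
print(f'Best dropout by final validation loss: {best_dropout}')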
5. 归一化.py (normalization)
import os
import pandas as pd

os.chdir('./data')

# Read the Excel file (no header row)
data = pd.read_excel('test2.xlsx', header=None)
# Value ranges and weights
t_min, t_max = data[0].min(), data[0].max()
l_min, l_max = data[1].min(), data[1].max()
a_min, a_max = data[2].min(), data[2].max()
weights = [0.2, 0.4, 0.4]
# Min-max normalization (columns 0 and 1 are inverted so that smaller raw values score higher)
data[0] = (t_max - data[0]) / (t_max - t_min)
data[1] = (l_max - data[1]) / (l_max - l_min)
data[2] = (data[2] - a_min) / (a_max - a_min)
# Compute the weighted composite score
data['综合得分'] = weights[0] * data[0] + weights[1] * data[1] + weights[2] * data[2]
# Write the result to test2.csv
data.to_csv('test2.csv', index=False, header=False)
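A tiny worked example of the scoring rule above, with hypothetical numbers just to make the weighting concrete: given three runs, the run with the smallest values in columns 0 and 1 and the largest value in column 2 ends up with a composite score of 1.0, and the worst run with 0.0.

# Hypothetical example of the composite score with three rows of (col0, col1, col2) values.
import pandas as pd

demo = pd.DataFrame([[10, 0.5, 0.9], [20, 1.0, 0.8], [30, 1.5, 0.7]])
t_min, t_max = demo[0].min(), demo[0].max()
l_min, l_max = demo[1].min(), demo[1].max()
a_min, a_max = demo[2].min(), demo[2].max()
demo[0] = (t_max - demo[0]) / (t_max - t_min)   # smaller raw value -> higher score
demo[1] = (l_max - demo[1]) / (l_max - l_min)   # smaller raw value -> higher score
demo[2] = (demo[2] - a_min) / (a_max - a_min)   # larger raw value -> higher score
demo['score'] = 0.2 * demo[0] + 0.4 * demo[1] + 0.4 * demo[2]
print(demo['score'].tolist())  # [1.0, 0.5, 0.0]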
6. length.py
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.chdir('./data')

# Chunk size for reading the data
batch_size = 5000
# Preprocessing: truncation lengths from 10 to 95 in steps of 5
lengths_to_test = range(10, 100, 5)
results = []
for max_seq_length in lengths_to_test:
    data_chunks = pd.read_csv('selected_proteins300_400stuff.csv', chunksize=batch_size)  # recreate the chunk reader for each length
    tokenizer = Tokenizer(char_level=True)
    family_labels = []
    sequences = []
    for chunk in data_chunks:
        for index, row in chunk.iterrows():
            family_labels.append(row['family_label'])
            seq = row['protein_sequence'][:max_seq_length]
            seq = seq.ljust(max_seq_length, 'X')  # pad with 'X' up to the chosen length
            sequences.append(seq)
    tokenizer.fit_on_texts(sequences)
    sequences = tokenizer.texts_to_sequences(sequences)
    sequences = pad_sequences(sequences, maxlen=max_seq_length)
    # Encode family labels as integers
    label_to_index = {label: index for index, label in enumerate(np.unique(family_labels))}
    labels = [label_to_index[label] for label in family_labels]
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    # Convert the data to tf.Tensor
    X_train_tf = tf.constant(X_train)
    X_val_tf = tf.constant(X_val)
    X_test_tf = tf.constant(X_test)
    y_train_tf = tf.constant(y_train)
    y_val_tf = tf.constant(y_val)
    y_test_tf = tf.constant(y_test)
    # Build the model (same architecture as in Dropout.py)
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_seq_length),
        tf.keras.layers.LSTM(128, return_sequences=True),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(label_to_index), activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])
    # Train the model
    model.fit(X_train_tf, y_train_tf, validation_data=(X_val_tf, y_val_tf), epochs=10, batch_size=128)
    # Evaluate on the test set
    loss, accuracy = model.evaluate(X_test_tf, y_test_tf)
    results.append({'Length': max_seq_length, 'Test Loss': loss, 'Test Accuracy': accuracy})
    print(f'Truncating each protein to its first {max_seq_length} amino acids - Test loss: {loss}, Test accuracy: {accuracy}')

# Save the results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv('length2.csv', index=False)
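An optional sketch (assuming matplotlib is available) for visualizing length2.csv, since the choice of truncation length is easier to judge from a curve than from the raw table.

# Plot test accuracy against truncation length from the results CSV.
import matplotlib.pyplot as plt
import pandas as pd

results_df = pd.read_csv('length2.csv')
plt.plot(results_df['Length'], results_df['Test Accuracy'], marker='o')
plt.xlabel('Truncation length (amino acids)')
plt.ylabel('Test accuracy')
plt.savefig('length_vs_accuracy.png')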
7. 超参数.py (hyperparameter search)
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.chdir('./data')

# Read the data in chunks
batch_size = 1000
data_chunks = pd.read_csv('selected_proteins300_400stuff.csv', chunksize=batch_size)

# Preprocessing
max_seq_length = 50
tokenizer = Tokenizer(char_level=True)
family_labels = []
sequences = []
for chunk in data_chunks:
    for index, row in chunk.iterrows():
        family_labels.append(row['family_label'])
        seq = row['protein_sequence'][:max_seq_length]
        seq = seq.ljust(max_seq_length, 'X')
        sequences.append(seq)
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)
label_to_index = {label: index for index, label in enumerate(np.unique(family_labels))}
labels = [label_to_index[label] for label in family_labels]
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
X_train_tf = tf.constant(X_train)
X_val_tf = tf.constant(X_val)
X_test_tf = tf.constant(X_test)
y_train_tf = tf.constant(y_train)
y_val_tf = tf.constant(y_val)
y_test_tf = tf.constant(y_test)

# Hyperparameter grid
embedding_dims = [64, 128, 256]  # Embedding layer dimensions
lstm_units = [64, 128, 256]      # units per LSTM layer
lstm_layers = [1, 2, 3]          # number of stacked LSTM layers
epochs = 10

# DataFrame for collecting the results
results_df = pd.DataFrame(columns=['Embedding_dims', 'LSTM_units', 'LSTM_layers', 'Test_loss', 'Test_accuracy'])
for emb_dim in embedding_dims:
    for unit in lstm_units:
        for layer in lstm_layers:
            print(f'Running for: Embedding_dims={emb_dim}, LSTM_units={unit}, LSTM_layers={layer}')
            # Build the model
            model = tf.keras.models.Sequential([
                tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=emb_dim, input_length=max_seq_length),
            ])
            for i in range(layer):
                model.add(tf.keras.layers.LSTM(unit, return_sequences=i < layer - 1))
            model.add(tf.keras.layers.Dense(128, activation='relu'))
            model.add(tf.keras.layers.Dropout(0.3))
            model.add(tf.keras.layers.Dense(len(label_to_index), activation='softmax'))
            model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])
            model.fit(X_train_tf, y_train_tf, validation_data=(X_val_tf, y_val_tf), epochs=epochs)
            # Evaluate on the test set
            loss, accuracy = model.evaluate(X_test_tf, y_test_tf)
            print(f'Test loss: {loss}')
            print(f'Test accuracy: {accuracy}')
            results_df = pd.concat([results_df, pd.DataFrame({'Embedding_dims': [emb_dim], 'LSTM_units': [unit], 'LSTM_layers': [layer], 'Test_loss': [loss], 'Test_accuracy': [accuracy]})], ignore_index=True)

# Save the results to a CSV file
results_df.to_csv('model_results.csv', index=False)
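An optional follow-up sketch for picking a configuration from model_results.csv; ranking purely by Test_accuracy is one simple choice among several possible criteria.

# Rank the grid-search results by test accuracy and show the best configurations.
import pandas as pd

results_df = pd.read_csv('model_results.csv')
print(results_df.sort_values('Test_accuracy', ascending=False).head())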
8. 模型评估.py (model evaluation)
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

os.chdir('./data')

# Read the data in chunks
batch_size = 5000
data_chunks = pd.read_csv('selected_proteins5000_400stuff.csv', chunksize=batch_size)

# Preprocessing
max_seq_length = 50
tokenizer = Tokenizer(char_level=True)
family_labels = []
sequences = []
for chunk in data_chunks:
    for index, row in chunk.iterrows():
        family_labels.append(row['family_label'])
        seq = row['protein_sequence'][:max_seq_length]
        seq = seq.ljust(max_seq_length, 'X')
        sequences.append(seq)
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)
label_to_index = {label: index for index, label in enumerate(np.unique(family_labels))}
labels = [label_to_index[label] for label in family_labels]
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
X_train_tf = tf.constant(X_train)
X_val_tf = tf.constant(X_val)
X_test_tf = tf.constant(X_test)
y_train_tf = tf.constant(y_train)
y_val_tf = tf.constant(y_val)
y_test_tf = tf.constant(y_test)

# Final configuration: 64-dimensional Embedding, 3 LSTM layers with 256 units each, 20 epochs
embedding_dim = 64
lstm_units = 256
lstm_layers = 3
epochs = 20

# Build the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_seq_length),
])
for i in range(lstm_layers):
    model.add(tf.keras.layers.LSTM(lstm_units, return_sequences=i < lstm_layers - 1))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(len(label_to_index), activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
history = model.fit(X_train_tf, y_train_tf, validation_data=(X_val_tf, y_val_tf), epochs=epochs, batch_size=256)

# Evaluate on the test set
loss, accuracy = model.evaluate(X_test_tf, y_test_tf)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

# Compute precision, recall, F1 score and the confusion matrix
y_probs = model.predict(X_test_tf)
y_pred = np.argmax(y_probs, axis=1)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
conf_matrix = confusion_matrix(y_test, y_pred)

# Save the results to the result folder
result_dir = './result'
os.makedirs(result_dir, exist_ok=True)
np.savetxt(os.path.join(result_dir, 'confusion_matrix.txt'), conf_matrix, fmt='%d')
with open(os.path.join(result_dir, 'evaluation_metrics.txt'), 'w') as f:
    f.write(f'Accuracy: {accuracy}\n')
    f.write(f'Precision: {precision}\n')
    f.write(f'Recall: {recall}\n')
    f.write(f'F1 Score: {f1}')

# Compute per-class ROC curves and AUC values (reusing y_probs from above)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(label_to_index)):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_probs[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot the per-class ROC curves
plt.figure()
for i in range(len(label_to_index)):
    plt.plot(fpr[i], tpr[i], lw=2, label='Class {0} (AUC = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')

# Save the AUC values to a CSV file
auc_df = pd.DataFrame.from_dict(roc_auc, orient='index', columns=['AUC'])
auc_df.index.name = 'Class'
auc_df.to_csv(os.path.join(result_dir, 'auc_scores.csv'))
plt.savefig(os.path.join(result_dir, 'roc_curve.png'))
plt.close()

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=False, cmap='Blues', xticklabels=False, yticklabels=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.savefig(os.path.join(result_dir, 'confusion_matrix_heatmap.png'))
plt.close()

# Plot the learning curves (training vs. validation accuracy per epoch)
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.savefig(os.path.join(result_dir, 'accuracy_learning_curve.png'))
plt.close()
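The script above reports one AUC value per class; the average AUC mentioned in the original comment can be summarized with a macro average. A short sketch, assuming the roc_auc dict from the evaluation script is still in scope:

# Macro-average AUC over all classes (uses roc_auc computed in 模型评估.py).
macro_auc = np.mean(list(roc_auc.values()))
print(f'Macro-average AUC: {macro_auc:.4f}')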
9. model.py
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.chdir('./data')

# Read the data in chunks
batch_size = 1000
data_chunks = pd.read_csv('selected_proteins5000_400stuff.csv', chunksize=batch_size)

# Preprocessing
max_seq_length = 50
tokenizer = Tokenizer(char_level=True)
family_labels = []
sequences = []
for chunk in data_chunks:
    for index, row in chunk.iterrows():
        family_labels.append(row['family_label'])
        seq = row['protein_sequence'][:max_seq_length]
        seq = seq.ljust(max_seq_length, 'X')
        sequences.append(seq)
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)
label_to_index = {label: index for index, label in enumerate(np.unique(family_labels))}
labels = [label_to_index[label] for label in family_labels]
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
X_train_tf = tf.constant(X_train)
X_val_tf = tf.constant(X_val)
X_test_tf = tf.constant(X_test)
y_train_tf = tf.constant(y_train)
y_val_tf = tf.constant(y_val)
y_test_tf = tf.constant(y_test)

# Configuration: 64-dimensional Embedding, 3 LSTM layers with 256 units each, 10 epochs
embedding_dim = 64
lstm_units = 256
lstm_layers = 3
epochs = 10

# Build the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_seq_length),
])
for i in range(lstm_layers):
    model.add(tf.keras.layers.LSTM(lstm_units, return_sequences=i < lstm_layers - 1))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(len(label_to_index), activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
model.fit(X_train_tf, y_train_tf, validation_data=(X_val_tf, y_val_tf), epochs=epochs, batch_size=256)

# Evaluate on the test set
loss, accuracy = model.evaluate(X_test_tf, y_test_tf)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')
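model.py trains and evaluates but does not persist anything. If the trained model, tokenizer, and label mapping are needed later for inference, a hedged sketch of one way to save them (the file names are illustrative, not part of the original project):

# Optional: persist the trained model, the tokenizer, and the label mapping for later inference.
import json

model.save('protein_family_model.keras')  # use '.h5' or a SavedModel directory on older TF versions
with open('tokenizer.json', 'w') as f:
    f.write(tokenizer.to_json())
with open('label_to_index.json', 'w') as f:
    json.dump(label_to_index, f)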