# 核心功能实现代码
(1)PSI-BLAST 批量生成
# Batch PSSM generation: run PSI-BLAST once for every regular file found in
# the query directory and write one ASCII PSSM per protein.
import subprocess

PSIBLAST_BIN = '/home/bhliu/ncbi-blast-2.12.0+/bin/psiblast'
BLAST_DB = '/home/bhliu/uniref50.fasta'
QUERY_DIR = '/home/bhliu/data/negativetraindata1/01/'
PSSM_DIR = '/home/bhliu/pssm/negativepssm/'

# Only regular files are treated as queries; sub-directories are ignored.
names = [
    name for name in os.listdir(QUERY_DIR)
    if os.path.isfile(os.path.join(QUERY_DIR, name))
]
for each_item in names:
    # The text before the first dot of the file name is used as the identifier.
    uniprotid = each_item.split('.')[0]
    # The arguments are passed as a list (no shell involved), so unusual
    # characters in a file name cannot be interpreted as shell syntax.
    cmd = [
        PSIBLAST_BIN,
        '-num_iterations', '3',
        '-db', BLAST_DB,
        '-query', QUERY_DIR + uniprotid + '.fasta',
        '-out_ascii_pssm', PSSM_DIR + uniprotid + '.pssm',
    ]
    try:
        completed = subprocess.run(cmd, check=False)
    except OSError as err:
        # Keep the batch going (as os.system did), but make the failure visible.
        print('psiblast could not be started for {}: {}'.format(uniprotid, err))
        continue
    if completed.returncode != 0:
        print('psiblast exited with status {} for {}'.format(
            completed.returncode, uniprotid))
(2)特征提取
def PSSM_composition(proteinSeq_pro,proteinSeq,pssmMatrix):
    """Compute the PSSM-composition feature vector.

    For each index ``k`` in 0..19, the rows of ``pssmMatrix`` at the positions
    where ``proteinSeq`` equals ``proteinSeq_pro[k]`` are accumulated column by
    column (columns 0..19), each term divided by ``len(pssmMatrix)``.

    Args:
        proteinSeq_pro: indexable reference alphabet, read at positions 0..19
            (presumably the 20 standard amino-acid letters -- confirm with
            the caller).
        proteinSeq: the sequence whose positions select rows of the matrix.
        pssmMatrix: row-indexable score matrix; row ``i`` is used for
            position ``i`` of ``proteinSeq``.

    Returns:
        A flat list of 400 numbers: the 20x20 accumulator laid out row by row.
    """
    # 20x20 accumulator, one row per reference entry, initialised to 0.0.
    totals = [[0.0] * 20 for _ in range(20)]
    for k in range(20):
        for i, residue in enumerate(proteinSeq):
            if proteinSeq_pro[k] == residue:
                scores = pssmMatrix[i]
                row_count = len(pssmMatrix)
                for j in range(20):
                    totals[k][j] = totals[k][j] + scores[j] / row_count
    # Flatten row by row.
    return [value for row in totals for value in row]
def S_FPSSM(proteinSeq_pro, proteinSeq, pssmMatrix):
    """Compute the S-FPSSM feature vector.

    The first 20 columns of the PSSM rows that correspond to positions of
    ``proteinSeq`` are first filtered: scores <= 0 become 0.0 and scores >= 7
    become 7.0. Then, for each index ``k`` in 0..19, the filtered rows at the
    positions where ``proteinSeq`` equals ``proteinSeq_pro[k]`` are summed
    column by column.

    The filtering is done on a private copy; ``pssmMatrix`` itself is never
    modified, so the same matrix can safely be passed to other feature
    extractors afterwards.

    Args:
        proteinSeq_pro: indexable reference alphabet, read at positions 0..19
            (presumably the 20 standard amino-acid letters -- confirm with
            the caller).
        proteinSeq: the sequence whose positions select rows of the matrix.
        pssmMatrix: row-indexable score matrix with at least
            ``len(proteinSeq)`` rows and 20 columns.

    Returns:
        A flat list of 400 numbers: the 20x20 accumulator laid out row by row.
    """
    # Build the filtered matrix as new lists instead of aliasing the input.
    filtered = []
    for i in range(len(proteinSeq)):
        source_row = pssmMatrix[i]
        clipped_row = []
        for k in range(20):
            score = source_row[k]
            if score <= 0.0:
                score = 0.0
            elif score >= 7.0:
                score = 7.0
            clipped_row.append(score)
        filtered.append(clipped_row)

    # 20x20 accumulator, one row per reference entry, initialised to 0.0.
    totals = [[0.0] * 20 for _ in range(20)]
    for k in range(20):
        for i, residue in enumerate(proteinSeq):
            if proteinSeq_pro[k] == residue:
                for j in range(20):
                    totals[k][j] = totals[k][j] + filtered[i][j]
    # Flatten row by row.
    return [value for row in totals for value in row]
def RPM_PSSM(proteinSeq_pro, proteinSeq, pssmMatrix):
    """Compute the RPM-PSSM feature vector.

    Scores <= 0 in the first 20 columns of the PSSM rows that correspond to
    positions of ``proteinSeq`` are replaced by 0. Then, for each index ``k``
    in 0..19, the resulting rows at the positions where ``proteinSeq`` equals
    ``proteinSeq_pro[k]`` are accumulated column by column, each term divided
    by ``len(proteinSeq)``.

    The replacement is done on a private copy; ``pssmMatrix`` itself is never
    modified, so the same matrix can safely be passed to other feature
    extractors afterwards.

    Args:
        proteinSeq_pro: indexable reference alphabet, read at positions 0..19
            (presumably the 20 standard amino-acid letters -- confirm with
            the caller).
        proteinSeq: the sequence whose positions select rows of the matrix.
        pssmMatrix: row-indexable score matrix with at least
            ``len(proteinSeq)`` rows and 20 columns.

    Returns:
        A flat list of 400 numbers: the 20x20 accumulator laid out row by row.
    """
    seq_len = len(proteinSeq)

    # Build the non-negative matrix as new lists instead of aliasing the input.
    positive = []
    for i in range(seq_len):
        source_row = pssmMatrix[i]
        positive.append(
            [0 if source_row[k] <= 0.0 else source_row[k] for k in range(20)]
        )

    # 20x20 accumulator, one row per reference entry, initialised to 0.0.
    totals = [[0.0] * 20 for _ in range(20)]
    for k in range(20):
        for i, residue in enumerate(proteinSeq):
            if proteinSeq_pro[k] == residue:
                for j in range(20):
                    totals[k][j] = totals[k][j] + positive[i][j] / seq_len
    # Flatten row by row.
    return [value for row in totals for value in row]
(3)打分函数
def model_process(svm_model, test_data_x, test_data_y,cv_num,model_name):
    """Print evaluation metrics for a fitted classifier on a test set.

    The model's predictions for ``test_data_x`` are compared with
    ``test_data_y``; the confusion matrix, the kappa coefficient, recall,
    specificity, precision, F1 score and MCC are printed, followed by the
    number of predictions equal to 0. Nothing is returned.

    Args:
        svm_model: object exposing ``predict``; its result must support
            ``tolist()``.
        test_data_x: test features passed to ``predict``.
        test_data_y: true labels for the test set.
        cv_num: value printed as the overall accuracy (computed by the caller).
        model_name: label inserted into the metrics line.

    NOTE(review): ``confusion_matrix`` and ``cohen_kappa_score`` are presumably
    the scikit-learn functions -- confirm against the file's imports. The
    code reads cell [1, 1] as TP, [0, 0] as TN, [0, 1] as FP and [1, 0] as FN.
    """
    predicted = svm_model.predict(test_data_x)

    print('总体精度为 : {}'.format(cv_num))
    print('混淆矩阵为 :\n {}'.format(confusion_matrix(test_data_y, predicted)))
    print('kappa系数为 :\n {}'.format(cohen_kappa_score(test_data_y, predicted)))

    counts = confusion_matrix(test_data_y, predicted)
    # Adding 0.0 turns the integer counts into floats before the divisions.
    true_pos, true_neg, false_pos, false_neg = (
        counts[row, col] + 0.0
        for row, col in ((1, 1), (0, 0), (0, 1), (1, 0))
    )

    recall = true_pos / (true_pos + false_neg)
    precision = true_pos / (true_pos + false_pos)
    specificity = true_neg / (true_neg + false_pos)
    f1_score = 2 * (true_pos / (2 * true_pos + false_pos + false_neg))

    mcc_numerator = (true_pos * true_neg) - (false_neg * false_pos)
    mcc_denominator = math.sqrt(
        (true_pos + false_neg) * (true_neg + false_pos)
        * (true_pos + false_pos) * (true_neg + false_neg)
    )
    mcc = mcc_numerator / mcc_denominator

    report = '{}的召回率(recall)为{:.4},特异度(specificity)为{:.4},精确率(precision)为{:.4},F1 score为{:.4},MCC值为{:.4} '
    print(report.format(model_name, recall, specificity, precision, f1_score, mcc))

    # Number of samples predicted as class 0.
    print(predicted.tolist().count(0))
(4)交叉验证模型性能
def model_cv(path):
Pssm=csv_read.csv_read(path)
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()#实例化
scaler.fit(Pssm)#fit,本质是生成均值和方差
# scaler.mean_#查看均值的属性mean_
# scaler.var_#查看方差的属性var_
pssm=scaler.transform(Pssm)#通过接口导出结果
y=[1for m in range(4000)]+[0for n in range(4000)]#1为毒力因子。0为非毒力因子
y=np.array(y)
train_x, test_x, train_y, test_y = train_test_split(pssm, y, test_size=0.2,random_state=420)
if path == 's_fpssm_train.csv':
j = 'S-FPSSM'
svc_model
基于序列进化信息的细菌毒力因子预测研究
于 2022-05-13 21:30:11 首次发布
![](https://img-home.csdnimg.cn/images/20240711042549.png)