2021SC@SDUSC
基于人工智能的多肽药物分析问题
主题:肽与HLA分子结合预测研究(3)
代码分析
代码结构
核心代码
接上回deephlapan_main()方法的后半部分
deephlapan_main.py
result = np.average(predScores, axis=0)
result1 = np.average(predScores1, axis=0)
with open(WD + '/' + fname + '_predicted_result.csv','w') as f:
f.write('Annotation,HLA,Peptide,binding score,immunogenic score\n')
if (opt.file):
for i in range(len(result)):
result[i]=("%.4f" % result[i])
result1[i]=("%.4f" % result1[i])
f.write(str(df.Annotation[i]) + ',' + str(df.HLA[i]) + ',' + str(df.peptide[i]) + ',' + str(result[i]) + ',' + str(result1[i]) + '\n')
else:
f.write('single peptide,' + str(hla) + ',' + str(peptide) + ',' + str(result[0]) + ',' + str(result1[0]) + '\n')
f.close()
if (opt.file):
command = 'perl ' + curDir + '/model/rank.pl ' + WD + '/' + fname + '_predicted_result.csv'
os.system(command)
j = datetime.datetime.now()
print (str(j) + ' Prediction end\n')
result 存放 predScores 沿 axis=0 的平均值,即各个结合模型对每条样本预测分数的平均;result1 同理,存放 predScores1(免疫原性模型预测分数)的平均值
下面是这段代码未分析的前半部分
deephlapan_main.py
# Set CUDA_VISIBLE_DEVICES to -1 before any model is loaded (conventionally
# this hides all CUDA GPUs from the backend so prediction runs on the CPU).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Directory that holds this script, with a trailing slash so that relative
# model paths can be appended by plain string concatenation.
curDir = os.path.dirname(os.path.realpath(__file__)) + '/'

# Tab-separated table; only its `HLA` and `sequence` columns are used below.
HLA_seq = pd.read_csv(curDir + 'model/MHC_pseudo.dat', sep='\t')

# Lookup table: HLA name -> sequence (a later duplicate name overrides an
# earlier one).
seqs = dict(zip(HLA_seq.HLA, HLA_seq.sequence))

# Accumulators filled by collect_result() and collect_result1().
predScores = []
predScores1 = []

# Residue letter -> 1-based integer code; 'X' (code 21) is the padding symbol
# used when sequences are encoded.
aa_idx = {
    residue: code
    for code, residue in enumerate('ACDEFGHIKLMNPQRSTVWYX', start=1)
}
os.path.realpath(__file__):获取当前 py 文件的完整路径(包含文件名)
os.path.dirname():去掉脚本的文件名,返回目录
接下来读取此目录下的model/MHC_pseudo.dat
文件,存入变量 HLA_seq
model/MHC_pseudo.dat
文件用excel打开内容如下:
然后将 HLA_seq 中内容一条条存入 seqs
设置数组 predScores=[] 和 predScores1=[],后面会用到
def run_model(i, X_test):
    """Score ``X_test`` with the ``i``-th binding model.

    Loads ``model/binding_model<i+1>.hdf5`` from ``curDir`` (with the custom
    ``Attention`` object registered for deserialisation) and runs
    ``predict_proba`` on ``X_test``.

    Args:
        i: Zero-based model index; it also selects a row of a 5-row scratch
            buffer, so it must be a valid index into 5 rows.
        X_test: Encoded inputs; ``len(X_test)`` is the number of samples.

    Returns:
        1-D float array of length ``len(X_test)`` holding the squeezed
        prediction scores.
    """
    n_samples = len(X_test)
    # One row per model slot; only row ``i`` is written and returned.
    buffer = np.zeros((5, n_samples))
    with CustomObjectScope({'Attention': Attention}):
        binding_model = load_model(
            curDir + 'model/binding_model' + str(i + 1) + '.hdf5'
        )
        predictions = binding_model.predict_proba(X_test)
        buffer[i, :] = np.squeeze(predictions)
    return buffer[i, :]
def run_model1(i, X_test):
    """Score ``X_test`` with the ``i``-th immunogenicity model.

    Loads ``model/immunogenicity_model<i+1>.hdf5`` from ``curDir`` (with the
    custom ``Attention`` object registered for deserialisation) and runs
    ``predict_proba`` on ``X_test``.

    Args:
        i: Zero-based model index; it also selects a row of a 5-row scratch
            buffer, so it must be a valid index into 5 rows.
        X_test: Encoded inputs; ``len(X_test)`` is the number of samples.

    Returns:
        1-D float array of length ``len(X_test)`` holding the squeezed
        prediction scores.
    """
    n_samples = len(X_test)
    # One row per model slot; only row ``i`` is written and returned.
    buffer = np.zeros((5, n_samples))
    with CustomObjectScope({'Attention': Attention}):
        immuno_model = load_model(
            curDir + 'model/immunogenicity_model' + str(i + 1) + '.hdf5'
        )
        predictions = immuno_model.predict_proba(X_test)
        buffer[i, :] = np.squeeze(predictions)
    return buffer[i, :]
加载自定义的模型,使用自定义对象作用域
model.predict_proba(X_test):返回模型对 X_test 中每个样本预测的概率值
np.squeeze():从数组的形状中删除单维条目,即把shape中为1的维度去掉,存入score数组
def collect_result(result):
    """Append ``result`` to the module-level ``predScores`` list."""
    # In-place append on the module-level list; no rebinding takes place, so
    # a ``global`` declaration is not required.
    predScores.append(result)
def collect_result1(result1):
    """Append ``result1`` to the module-level ``predScores1`` list."""
    # In-place append on the module-level list; no rebinding takes place, so
    # a ``global`` declaration is not required.
    predScores1.append(result1)
将每个模型的预测结果分别追加到全局列表 predScores(结合预测)和 predScores1(免疫原性预测)中
def transform(HLA, peptide):
    """Encode an HLA sequence plus a peptide as a list of integer codes.

    The two strings are concatenated, right-padded with ``'X'`` up to 49
    characters (longer inputs are left unpadded and untruncated), and each
    character is mapped through ``aa_idx``.

    Args:
        HLA: HLA sequence string.
        peptide: Peptide sequence string.

    Returns:
        List of integer codes, one per character of the padded string.

    Raises:
        KeyError: If a character is not a key of ``aa_idx``.
    """
    padded = (HLA + peptide).ljust(49, 'X')
    return [aa_idx[residue] for residue in padded]
将 HLA 序列与肽序列拼接,长度不足 49 时在末尾用 'X' 补齐,再利用之前定义的 aa_idx 把每个氨基酸字母转换为对应的整数编号
def read_and_prepare(file):
    """Read a CSV of HLA/peptide pairs and encode every row.

    The CSV must provide an ``HLA`` column (names that are keys of ``seqs``)
    and a ``peptide`` column. Each row is encoded with ``transform`` and the
    per-row code lists are stacked into one 2-D array.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        2-D array with one row per CSV row.

    Raises:
        KeyError: If a column is missing, an HLA name is not in ``seqs``, or
            a residue is not in ``aa_idx``.
        ValueError: Raised by ``np.vstack`` when there are no rows or the
            encoded rows differ in length.
    """
    data = pd.read_csv(file)
    # Encode straight from the two columns instead of going through a
    # row-wise DataFrame.apply and a scratch column: this avoids apply's
    # list-like result inference and leaves the frame untouched.
    encoded_rows = [
        transform(HLA=seqs[hla_name], peptide=peptide)
        for hla_name, peptide in zip(data['HLA'], data['peptide'])
    ]
    return np.vstack(encoded_rows)
def read_and_prepare_single(peptide, hla):
    """Encode a single HLA/peptide pair as a ``(1, 49)`` integer array.

    The array starts filled with 21 (the ``aa_idx`` code of the padding
    symbol ``'X'``); the codes of ``seqs[hla] + peptide`` then overwrite the
    leading positions.

    Args:
        peptide: Peptide sequence string.
        hla: HLA name; must be a key of ``seqs``.

    Returns:
        Integer array of shape ``(1, 49)``.

    Raises:
        KeyError: If ``hla`` is not in ``seqs`` or a residue is not in
            ``aa_idx``.
        IndexError: If the combined sequence is longer than 49 residues.
    """
    encoded = np.full((1, 49), 21, int)
    codes = [aa_idx[residue] for residue in seqs[hla] + peptide]
    for position, code in enumerate(codes):
        encoded[0, position] = code
    return encoded
读取文件后,利用前面的 transform(HLA, peptide) 函数对每一行的 HLA 和肽进行编码,再用 np.vstack 把各行结果堆叠成二维数组返回
对于单条数据,直接在方法中进行transform