项目包含两部分,自定义函数和主体
主体部分
import numpy as np
from Bio.PDB import *
from Bio import PDB
import os
import Bio.PDB.SASA
from Bio.PDB.SASA import ShrakeRupley
from pidchains import loadchains
from pidchains import searchchains
from pidchains import parse_pdb_split_chain
from pidchains import combine_file
from pidchains import Distance
from sklearn.metrics import pairwise_distances
from Bio.PDB.PDBParser import PDBParser
##############################################################################
#输入文件路径
##############################################################################
file= open("Masif32_list.txt", "r") ##需要下载的id文件
path1="Masif_list.txt" ##path1:原始未经提取的文件路径(仅有id chains)
path2="data0_6000.txt" ##path2:经过fasta序列提取后的文件(包含序列)
##############################################################################
#文件下载循环
##############################################################################
dictcombine={}
for i in file: ##对文件中id进行分割处理
j=i.strip().split("\n")
for k in range(0,1): ##循环次数
id=j[k]
#pdbl = PDBList() ####批量下载数据库PDB
#pdbl.retrieve_pdb_file(id, pdir='.', file_format='pdb')
##############################################################################
#储存id需要提取的链
##############################################################################
parser = PDBParser(PERMISSIVE = True, QUIET = True)
path4="D:\\pythonProject\\project1_ppidata\\sitework\\combinepdb\\%s.txt"%id
path3="D:\\pythonProject\\project1_ppidata\\sitework\\pdbxxent\\pdb\\pdb%s.ent"%id
data = parser.get_structure(id,path3) ##加载导入的PDB文件,提出需要提取的链名称
chains1name,chains2name=searchchains(path1,id) ##链名,A,B,H,N,FH,等
model=data.get_models()
models=list(model) ##提取MODEL结构
#print(type(chains1name))
#print(type(id))
###############################################################################
combinelist=[]
elementsite1=[]
elementsite2=[]
elementresidueid1=[]
elementresidueid2=[]
elementchains1=[]
elementchains2=[]
elementcontact1=[]
elementcontact2=[]
##############################################################################
# 每条链的序列提取
##############################################################################
for name in chains1name: ##输出链名以及链序列
namef1,chains=loadchains(id,name,path2)
if chains==None:
continue
else:
print(id,name)
#print(chains) ##打印对应的序列
elementchains1.append(chains)
if len(chains1name)==1 and not(namef1==1):
continue
for name in chains2name: ##输出链名以及链序列
namef2,chains=loadchains(id,name,path2)
if chains==None:
continue
else:
print(id,name)
#print(chains) ##打印对应的序列
elementchains2.append(chains)
if len(chains2name) == 1 and not (namef2 == 1):
continue
##############################################################################
#SASA
##############################################################################
#SASASC(data,chains1name,chains2name) ##SC:输出structure和chains的SASA
##____________________________________________________________________________
##____________________________________________________________________________
parse_pdb_split_chain(path3,id, "D:\\pythonProject\\project1_ppidata\\sitework\\pdbsplit\\", chains1name, chains2name) ##按照链名称将PDB文件分成链的PDB并保存
combine_file(id)
data=parser.get_structure(id,path4)
##____________________________________________________________________________
parser = PDBParser(PERMISSIVE=True, QUIET=True)
path4_1= "D:\\pythonProject\\project1_ppidata\\sitework\\pdbsplit\\"+id+"_"+chains1name+".pdb"
path4_2= "D:\\pythonProject\\project1_ppidata\\sitework\\pdbsplit\\"+id+"_"+chains2name+".pdb"
data2_1 = parser.get_structure(id,path4_1)
data2_2 = parser.get_structure(id,path4_2)
#chains1namelist = SASACR(data2_1, chains1name)
#chains2namelist = SASACR(data2_2, chains2name)
##____________________________________________________________________________
sr = ShrakeRupley()
sr.compute(data, level="R")
sr = ShrakeRupley()
sr.compute(data2_1, level="R")
sr = ShrakeRupley()
sr.compute(data2_2, level="R")
len1 = len(chains1name)
len2 = len(chains2name)
#chain = data[0][chains1name[:]] ##提取残基SASA
#chain2=data[0][chains2name[:]]
##____________________________________________________________________________
sasaresiduechain1=[]
sasaresiduechain2 = []
sasamodelchain1 = []
sasamodelchain2=[]
result = open("sasadata\\" + "result" + id + chains1name + ".csv", "a")
result2 = open("sasadata\\" + "result" + id + chains2name + ".csv", "a")
sitedata1 = open("sitedata\\" + "result" + id + chains1name + ".csv", "a")
sitedata2 = open("sitedata\\" + "result" + id + chains2name + ".csv", "a")
sitedatarearrangement1=open("sitedata_rearrangement\\" + "result" + id + chains1name + ".csv", "a")
sitedatarearrangement2 = open("sitedata_rearrangement\\" + "result" + id + chains2name + ".csv", "a")
##____________________________________________________________________________
for model in data2_1:
for chain in model:
for residue in chain:
elementresidueid1.append(residue.id[1])
for model in data2_2:
for chain in model:
for residue in chain:
elementresidueid2.append(residue.id[1])
##____________________________________________________________________________
rearrangenum=0
num = 0
rearrangenumlist1=[]
for n in range(0,len1):
chain=data[0][chains1name[n]]
for res in chain:
id2 = res.get_id()
if id2[0] == ' ' :
#print(chains1name[n],round(data[0][chains1name[n]][id[1]].sasa, 2),round(data2_1[0][chains1name][id[1]].sasa, 2),int(round(data2_1[0][chains1name][id[1]].sasa, 2)-round(data[0][chains1name][id[1]].sasa, 2)))
#num = num + 1
print(id2)
sasaresiduechain1.append(round(data2_1[0][chains1name[n]][id2[1]].sasa, 2))
sasamodelchain1.append(round(data[0][chains1name[n]][id2[1]].sasa, 2))
result.write(f"{sasamodelchain1[num]}\t{sasaresiduechain1[num]}\t{int(sasaresiduechain1[num])-int(sasamodelchain1[num])}\n")
if int(sasaresiduechain1[num])-int(sasamodelchain1[num]) >= 1:
sitedata1.write(f"1\t{chain.id}{id2[1]}\n")
sitedatarearrangement1.write(f"{str(rearrangenum)}\n")
rearrangenumlist1.append(rearrangenum)
rearrangenum=rearrangenum+1
elementsite1.append("1")
else:
sitedata1.write(f"0\n")
elementsite1.append("0")
rearrangenum=rearrangenum+1
num=num+1
rearrangenum = 0
num=0
rearrangenumlist2 = []
for n in range(0,len2):
chain2=data[0][chains2name[n]]
for res in chain2:
id2 = res.get_id()
if id2[0] == ' ' :
#print(chains2name,round(data[0][chains2name][id[1]].sasa, 2),round(data2_2[0][chains2name][id[1]].sasa, 2),int(round(data2_2[0][chains2name][id[1]].sasa, 2)-round(data[0][chains2name][id[1]].sasa, 2)))
print(id2)
sasamodelchain2.append(round(data[0][chains2name[n]][id2[1]].sasa, 2))
sasaresiduechain2.append(round(data2_2[0][chains2name[n]][id2[1]].sasa, 2))
result2.write(f"{sasamodelchain2[num]}\t{sasaresiduechain2[num]}\t{int(sasaresiduechain2[num])-int(sasamodelchain2[num])}\n")
if int(sasaresiduechain2[num]) - int(sasamodelchain2[num]) >= 1:
sitedata2.write(f"1\t{chain2.id}{id2[1]}\n")
sitedatarearrangement2.write(f"{str(rearrangenum)}\n")
rearrangenumlist2.append(rearrangenum)
rearrangenum = rearrangenum + 1
elementsite2.append("1")
else:
sitedata2.write(f"0\n")
rearrangenum = rearrangenum + 1
elementsite2.append("0")
num = num + 1
##____________________________________________________________________________
##############################################################################
#比较作用位点的距离
##############################################################################
#print(id,data2_1,data2_2,chains1name,chains2name)
##############################################################################
# 比较作用位点的距离
##############################################################################
structure1 = data2_1
structure2 = data2_2
"""This gets the coordinates for all.csv the atoms"""
x_y_z_1 = list()
x_y_z_2 = list()
# f=open("D:\\pythonProject\\seqdiy\\sitework\\distancedata\\"+id+".txt","w")
sitedata1 = open("D:\\pythonProject\\project1_ppidata\\sitework\\sitedata\\result" + id + chains1name + ".csv", "r")
sitedata2 = open("D:\\pythonProject\\project1_ppidata\\sitework\\sitedata\\result" + id + chains2name + ".csv", "r")
number1 = list()
number2 = list()
xiabiao1 = []
xiabiao2 = []
lenresidue1=0
lenresidue2=0
numresidue1=[0 for x in range(0,100)] ##用于插入GAP
numresidue2=[0 for x in range(0,100)] ##用于插入GAP
for string in sitedata1:
if string[2:-1]:
number = (string[2:-1])
number1.append(number)
#print(len(number1)) # 28
#print(number1)
if string[0:]:
lenresidue1=lenresidue1+1
for string in sitedata2:
if string[2:-1]:
number = (string[2:-1])
number2.append(number)
# print(len(number2))#25
#print(number2)
if string[0:]:
lenresidue2=lenresidue2+1
#print(len(number1),len(number2))
n = -1
for model in structure1:
for chain in model:
m = 1
n=n+1
for residue in chain:
chainnum = str(chain.id) + str(residue.id[1])
numresidue1[n]=m
m=m+1
# print(chainnum)
if chainnum in number1:
# print(residue.id[1],residue.id[1] in sitedata2,type(residue.id[1]))
for atom in residue:
# print(atom)
if "C" in atom.fullname and "A" in atom.fullname:
# print(atom.fullname,atom.coord[0],atom.coord[1],atom.coord[2])
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
if "C" in atom.fullname and "B" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
x_y_z_1.append([x, y, z])
n = -1
for model in structure2:
for chain in model:
m = 1
n = n + 1
for residue in chain:
numresidue2[n] = m
m = m + 1
chainnum = str(chain.id) + str(residue.id[1])
#print(chainnum)
if chainnum in number2: ###多条链在一个文件中,且编码id相同。
# print(residue.id[1])
for atom in residue:
# print(atom.fullname)
if "C" in atom.fullname and "A" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
if "C" in atom.fullname and "B" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
x_y_z_2.append([x, y, z])
dic2 = {}
dic1 = {}
contactlist1=[]
contactlist2=[]
x_y_z_1 = np.array(x_y_z_1)
x_y_z_2 = np.array(x_y_z_2)
#print("2", len(x_y_z_2))
#print("1", len(x_y_z_1))
# f.write(f"{pairwise_distances(x_y_z_1,x_y_z_2)}")
# print(len(pairwise_distances(x_y_z_1,x_y_z_2)),len(x_y_z_2))
datalist12 = pairwise_distances(x_y_z_1, x_y_z_2) ###[[……],[……],[……],[……],[……]]
#print(datalist12, len(datalist12))
resultlistelement = list()
resultlist12 = list()
n = 1
for element in datalist12:
del resultlistelement ####[24,0,0,50,65,0,0,1,7,26,19]
resultlistelement = []
for i in range(0, len(x_y_z_2)):
if element[i] <= 8: ###24,0,0……
resultlistelement.append("1") ##[1]
else:
resultlistelement.append("0")
# if "1" not in resultlistelement: ###判断哪个小列表中没有1
# n=n+1
resultlist12.append(resultlistelement)
# print(resultlist12)
# ___________________________________________________________________________________________
store = []
for i in range(0, len(x_y_z_1)):
del store
# print("\n",id + " " + chains1name,end=":")
element = resultlist12[i]
if "1" in element:
# print("site :",number1[i],"residue(%s) in contact :"%chains2name,end=" " )
store = []
for j in range(0, len(x_y_z_2)):
if element[j] == "1":
# print(number2[j],end=" ")
store.append(number2[j])
dic1[number1[i]] = store
else:
# print("site :",number1[i],"residue(%s) in contact :"%chains2name,"F",end="" )
store = []
dic1[number1[i]] = "F"
# print("\n",dic1)
# ___________________________________________________________________________________________
datalist21 = pairwise_distances(x_y_z_2, x_y_z_1) ###[[……],[……],[……],[……],[……]]
resultlistelement = list()
resultlist21 = list()
n = 1
for element in datalist21:
del resultlistelement ####[24,0,0,50,65,0,0,1,7,26,19]
resultlistelement = []
for i in range(0, len(x_y_z_1)):
if element[i] <= 8: ###24,0,0……
resultlistelement.append("1") ##[1]
else:
resultlistelement.append("0")
# if "1" not in resultlistelement:
# print(n)
# n = n + 1
resultlist21.append(resultlistelement)
# print(resultlist21)
store = []
for i in range(0, len(x_y_z_2)):
del store
# print("\n", id + " " + chains2name, end=":")
element = resultlist21[i]
if "1" in element:
# print("site :", number2[i], "residue(%s) in contact :" % chains1name, end=" ")
store = []
for j in range(0, len(x_y_z_1)):
if element[j] == "1":
# print(number1[j], end=" ")
store.append(number1[j])
dic2[number2[i]] = store
else:
# print("site :", number2[i], "residue(%s) in contact :" % chains1name, "F", end="")
store = []
dic2[number2[i]] = "F"
#########################################################################################
#contactlist1.2
#########################################################################################
contactlista=0
emp=[]
resultlistnum=0 ###循环中遍历result列表[[0000000],[00001000],[00010000],[00111000]]的变量
for contactlista in range(0,lenresidue1): ##lenresidue1:sitedata1文件的列数
text = []
if contactlista in rearrangenumlist1:
element=resultlist12[resultlistnum]
resultlistnum=resultlistnum+1
if "1" in element :
for m in range(0,len(x_y_z_2)):
if element[m]=="1":
text.append(rearrangenumlist2[m])
contactlist1.append(text)
del text
else:
contactlist1.append(emp)
else:
contactlist1.append(emp)
contactlistb=0
emp=[]
resultlistnum=0 ###循环中遍历result列表[[0000000],[00001000],[00010000],[00111000]]的变量
for contactlistb in range(0,lenresidue2): ##lenresidue1:sitedata1文件的列数
text = []
if contactlistb in rearrangenumlist2:
element=resultlist21[resultlistnum]
resultlistnum=resultlistnum+1
if "1" in element :
for m in range(0,len(x_y_z_1)):
if element[m]=="1":
text.append(rearrangenumlist1[m])
contactlist2.append(text)
del text
else:
contactlist2.append(emp)
else:
contactlist2.append(emp)
#print(len(contactlist1))
#########################################################################################
# contactlist1.2
#########################################################################################
# print("\n", dic2)
# ___________________________________________________________________________________________
np.save("D:\\pythonProject\\project1_ppidata\\sitework\\distancedata\\" + id + "12.npy", resultlist12)
np.save("D:\\pythonProject\\project1_ppidata\\sitework\\distancedata\\" + id + "21.npy", resultlist21)
np.save("D:\\pythonProject\\project1_ppidata\\sitework\\residuecontactdict\\" + id + "12.npy", dic1)
np.save("D:\\pythonProject\\project1_ppidata\\sitework\\residuecontactdict\\" + id + "21.npy", dic2)
#dict1,dict2=Distance(id,data2_1,data2_2,chains1name,chains2name)
combinelist.append(id)
if len(chains1name)>1:
for i in range(0,len(chains1name)-1):
for n in range(0, 30):
elementsite1.insert(numresidue1[i],2)
elementresidueid1.insert(numresidue1[i],0)
#elementchains1.insert(numresidue1[i],0)
elementcontact1.insert(numresidue1[i],[])
if len(chains2name) > 1:
for i in range(0, len(chains2name) - 1):
for n in range(0,30):
elementsite2.insert(numresidue2[i], 2)
elementresidueid2.insert(numresidue2[i],0)
#elementchains1.insert(numresidue2[i], 0)
elementcontact1.insert(numresidue2[i], [])
combinelist.append(elementsite1)
combinelist.append(elementsite2)
combinelist.append(elementresidueid1)
combinelist.append(elementresidueid2)
combinelist.append(elementchains1)
combinelist.append(elementchains2)
combinelist.append(contactlist1)
combinelist.append(contactlist2)
#np.save("D:\\pythonProject\\seqdiy\\sitework\\combinedata\\"+id+".npy",combinelist)
#print(combinelist)
#np.loadtxt("D:\\pythonProject\\seqdiy\\sitework\\combinedata\\combinedata.txt")
dictcombine[id]=combinelist
#np.save("D:\\pythonProject\\project1_ppidata\\sitework\\combinedata\\all_out_data.npy",dictcombine)
print("finish",id,dictcombine)
pidchains:(自定义函数)需要improt
from Bio import PDB
import os
import Bio.PDB.SASA
from Bio.PDB.SASA import ShrakeRupley
import re
###############################################################################
#遍历文件查找id相同的条目,并提出条目后链的名称:1A0G_A_B ——> 1A0G chains1= A chains2= B
###############################################################################
def searchchains(path,id):
f=open(path,"r")
for seq in f: ##导入序列的链
seq_1 = seq.strip().split("\n")
for num in seq_1:
seq_2 = " ".join(seq_1)[0:4]
##MASIF中的id
if id == seq_2:
seq_3=num.strip().split("_") ##seq_3=以_分割的列表
chains1 = "".join(seq_3[1]).strip()
chains2 = "".join(seq_3[2]).strip() ##chains1和chains2保存,后续进行提取
#print('|entry|', seq_1, '|id|', seq_2, '|chains1|', chains1,
# '|chains2|', chains2)
if id == seq_2:
break
if id == seq_2:
break
return(chains1,chains2)
###############################################################################
#已知链id和which chains后提取序列(打印)
###############################################################################
def loadchains(id,name,path):
f=open(path,"r")
for seq in f:
seq_1=seq.strip().split("\n") #遍历已提取好的序列
seq_2=seq_1[0].strip("|").split("|") #seq:str,,seq_1:list
seq_3=seq_2[0] #seq_2:list
if id ==seq_3:
if name in " ".join(seq_2[1])[1:]:
if len(seq_2[3]) < 50 :
return(2,seq_2[3])
else:return(1,seq_2[3])
return(0,None)
#if name in " ".join(seq_2[1])[1:]:
#return(seq_2[3]) #这个是序列
###############################################################################
#SASA封装structure chains
###############################################################################
def SASASC(data,chains1name,chains2name):
num=0
model=data.get_models() ##提取文件中的模型信息
models=list(model)
sr = ShrakeRupley() ##SR
sr.compute(data, level="S") ##结构水平的计算
print('structure', round(data.sasa, 3))
sasastru = float(round(data.sasa, 2)) ##暂存
zchains = list(models[0].get_chains()) ##将chains信息储存列表当中
# print(len(zchains),zchains)
srchains = ShrakeRupley()
a = list(zchains) ##将列表中id与需要提取的id进行比对
for i in range(0, len(zchains)):
if chains1name == a[i].id:
srchains.compute(a[i], level="C")
sasachains = float(round(data.sasa, 2))
print(chains1name, round(a[i].sasa, 2))
if chains2name == a[i].id:
srchains.compute(a[i], level="C")
sasachains = float(round(data.sasa, 2))
print(chains2name, round(a[i].sasa, 2))
###############################################################################
#SASA封装 chains residue
###############################################################################
def SASACR(data,chains1name):
model=data.get_models() ##提取文件中的模型信息
models=list(model)
##暂存
zchains = list(models[0].get_chains()) ##将chains信息储存列表当中
# print(len(zchains),zchains)
srchains = ShrakeRupley()
a = list(zchains) ##将列表中id与需要提取的id进行比对
chains1namelist = {}
for i in range(0, len(zchains)):
if chains1name == a[i].id:
srchains.compute(a[i], level="C")
print(chains1name, round(a[i].sasa, 2))
residue = list(zchains[i].get_residues())
for m in range(0, len(residue)):
srchains.compute(residue[m], level="R")
a2=round(residue[m].sasa, 2)
#print(chains1name,a2)
chains1namelist[m]=a2
return (chains1namelist)
###############################################################################
#提取PDB残基SASA
###############################################################################
'''from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
p = PDBParser(QUIET=1)
# This assumes you have a local copy of 1LCD.pdb in a directory called "PDB"
struct = p.get_structure("1A14", "1a14.pdb")
sr = ShrakeRupley()
sr.compute(struct, level="R")
chain = struct[0]['N']
for res in chain:
id = res.get_id()
if id[0] == ' ':
print(round(struct[0]["N"][id[1]].sasa, 2))
'''
###############################################################################
#将PDB文件中的链单独保存为PDB文件
###############################################################################
def parse_pdb_split_chain(pdbgzFile, id,outpath,chain1name,chain2name):
with open(pdbgzFile, 'rb') as pdbF:
pdbcontent = pdbF.read()
pdbcontent = pdbcontent.decode()
pattern = re.compile('ATOM\s+\d+\s*\w+\s*[A-Z]{3,4}\s*(\w)\s*.+\n', re.MULTILINE)
match = list(set(list(pattern.findall(pdbcontent))))
for chain in match:
if chain in chain1name :
patt_helix = re.compile('(HELIX\s+\w+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_sheet = re.compile('(SHEET\s+\w+\s*\w+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_links = re.compile('(LINK\s+\w+\s*\w+\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_cha = re.compile('(ATOM\s+\d+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
match_helix = patt_helix.findall(pdbcontent)
match_sheet = patt_sheet.findall(pdbcontent)
match_links = patt_links.findall(pdbcontent)
match_cha = patt_cha.findall(pdbcontent)
outfile = outpath + id + '_' + chain1name+ '.pdb'
outF = open(outfile, 'a')
for i in range(len(match_helix)): ## alpha-helix
outF.write(match_helix[i] + '\n')
for j in range(len(match_sheet)): ## beta-sheet
outF.write(match_sheet[j] + '\n')
for k in range(len(match_links)): ## Links
outF.write(match_links[k] + '\n')
for l in range(len(match_cha)): ## ATOM
outF.write(match_cha[l] + '\n')
outF.write('TER\n')
outF.write('END\n')
outF.close()
for chain in match:
if chain in chain2name:
patt_helix = re.compile('(HELIX\s+\w+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_sheet = re.compile('(SHEET\s+\w+\s*\w+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_links = re.compile('(LINK\s+\w+\s*\w+\s*' + chain + '\s*.+)\n', re.MULTILINE)
patt_cha = re.compile('(ATOM\s+\d+\s*\w+\s*[A-Z]{3,4}\s*' + chain + '\s*.+)\n', re.MULTILINE)
match_helix = patt_helix.findall(pdbcontent)
match_sheet = patt_sheet.findall(pdbcontent)
match_links = patt_links.findall(pdbcontent)
match_cha = patt_cha.findall(pdbcontent)
outfile = outpath + id + '_' + chain2name + '.pdb'
outF = open(outfile, 'a')
for i in range(len(match_helix)): ## alpha-helix
outF.write(match_helix[i] + '\n')
for j in range(len(match_sheet)): ## beta-sheet
outF.write(match_sheet[j] + '\n')
for k in range(len(match_links)): ## Links
outF.write(match_links[k] + '\n')
for l in range(len(match_cha)): ## ATOM
outF.write(match_cha[l] + '\n')
outF.write('TER\n')
outF.write('END\n')
outF.close()
###############################################################################
#合并文件,合并已分离的两个文件,达到与删除文件相同的效果
###############################################################################
import os
import linecache
def combine_file(id):
#### 读取指定路径下的所有文件并放入到列表中
root = "D:\\pythonProject\\project1_ppidata\\sitework\\pdbsplit" ###绝对路径
file_names = os.listdir(root)
#print(file_names)
file_ob_list = []
for file_name in file_names:
fileob = root + '/' + file_name
file_ob_list.append(fileob)
#print(file_ob_list)
### 对每个文件,按行读取文件内容并放入同一个列表data中
data = []
for file_ob in file_ob_list:
if id in file_ob:
line_num = 1
length_file = len(open(file_ob, encoding='utf-8').readlines())
#print(length_file)
while line_num <= length_file:
line = linecache.getline(file_ob, line_num)
line = line.strip()
data.append(line)
line_num = line_num + 1
### 将data内容写入到生成的txt文件中,注意编码问题
f = open('D:\\pythonProject\\project1_ppidata\\sitework\\combinepdb\\' + id + ".txt", 'w+', encoding='utf-8')
for i, p in enumerate(data):
#print(i, p)
f.write(p + '\n')
f.close()
###############################################################################
#筛选序列长度小于50且未与其他链相结合的蛋白质链
###############################################################################
def flit(chains,name):
trantab=str.maketrans("%s"%name,None)
return(chains.translate(trantab))
###############################################################################
#计算原子间距离
###############################################################################
#蛋白质残基接触的定义就是空间中2个氨基酸集团的Ca原子(一般用Ca原子来计算接触)的空间距离小于8Å(Å是距离单位)
from sklearn.metrics import pairwise_distances
import numpy as np
from Bio.PDB.PDBParser import PDBParser
import Bio.PDB
def Distance(id,data2_1,data2_2,chains1name,chains2name):
structure1=data2_1
structure2=data2_2
"""This gets the coordinates for all.csv the atoms"""
x_y_z_1=list()
x_y_z_2=list()
#f=open("D:\\pythonProject\\seqdiy\\sitework\\distancedata\\"+id+".txt","w")
sitedata1=open("D:\\pythonProject\\seqdiy\\sitework\\sitedata\\result"+ id +chains1name+".csv","r")
sitedata2=open("D:\\pythonProject\\seqdiy\\sitework\\sitedata\\result" + id + chains2name + ".csv","r")
number1=list()
number2=list()
xiabiao1=[]
xiabiao2=[]
for string in sitedata1:
if string[2:-1]:
number=(string[2:-1])
number1.append(number)
print(len(number1))#28
print(number1)
for string in sitedata2:
if string[2:-1]:
number=(string[2:-1])
number2.append(number)
#print(len(number2))#25
print(number2)
for model in structure1:
for chain in model:
for residue in chain:
chainnum=str(chain.id)+str(residue.id[1])
#print(chainnum)
if chainnum in number1:
#print(residue.id[1],residue.id[1] in sitedata2,type(residue.id[1]))
for atom in residue:
#print(atom)
if "C" in atom.fullname and "A" in atom.fullname:
#print(atom.fullname,atom.coord[0],atom.coord[1],atom.coord[2])
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
if "C" in atom.fullname and "B" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
x_y_z_1.append([x, y, z])
for model in structure2:
for chain in model:
for residue in chain:
chainnum = str(chain.id) + str(residue.id[1])
print(chainnum)
if chainnum in number2: ###多条链在一个文件中,且编码id相同。
#print(residue.id[1])
for atom in residue:
#print(atom.fullname)
if "C" in atom.fullname and "A" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
if "C" in atom.fullname and "B" in atom.fullname:
x = (atom.coord[0])
y = (atom.coord[1])
z = (atom.coord[2])
x_y_z_2.append([x, y, z])
dic2 = {}
dic1 = {}
x_y_z_1 = np.array(x_y_z_1)
x_y_z_2 = np.array(x_y_z_2)
print("2",x_y_z_2)
print("1",x_y_z_1)
#f.write(f"{pairwise_distances(x_y_z_1,x_y_z_2)}")
#print(len(pairwise_distances(x_y_z_1,x_y_z_2)),len(x_y_z_2))
datalist12=pairwise_distances(x_y_z_1,x_y_z_2) ###[[……],[……],[……],[……],[……]]
print(datalist12,len(datalist12))
resultlistelement=list()
resultlist12 = list()
n=1
for element in datalist12:
del resultlistelement ####[24,0,0,50,65,0,0,1,7,26,19]
resultlistelement=[]
for i in range(0,len(x_y_z_2)):
if element[i]<=8 : ###24,0,0……
resultlistelement.append("1") ##[1]
else:
resultlistelement.append("0")
#if "1" not in resultlistelement: ###判断哪个小列表中没有1
#n=n+1
resultlist12.append(resultlistelement)
#print(resultlist12)
#___________________________________________________________________________________________
store = []
for i in range(0,len(x_y_z_1)):
del store
#print("\n",id + " " + chains1name,end=":")
element=resultlist12[i]
if "1" in element:
#print("site :",number1[i],"residue(%s) in contact :"%chains2name,end=" " )
store=[]
for j in range(0,len(x_y_z_2)):
if element[j]== "1":
#print(number2[j],end=" ")
store.append(number2[j])
dic1[number1[i]]=store
else:
#print("site :",number1[i],"residue(%s) in contact :"%chains2name,"F",end="" )
store=[]
dic1[number1[i]]="F"
#print("\n",dic1)
#___________________________________________________________________________________________
datalist21 = pairwise_distances(x_y_z_2, x_y_z_1) ###[[……],[……],[……],[……],[……]]
resultlistelement = list()
resultlist21 = list()
n = 1
for element in datalist21:
del resultlistelement ####[24,0,0,50,65,0,0,1,7,26,19]
resultlistelement = []
for i in range(0, len(x_y_z_1)):
if element[i] <= 8: ###24,0,0……
resultlistelement.append("1") ##[1]
else:
resultlistelement.append("0")
#if "1" not in resultlistelement:
#print(n)
#n = n + 1
resultlist21.append(resultlistelement)
#print(resultlist21)
store = []
for i in range(0, len(x_y_z_2)):
del store
#print("\n", id + " " + chains2name, end=":")
element = resultlist21[i]
if "1" in element:
#print("site :", number2[i], "residue(%s) in contact :" % chains1name, end=" ")
store = []
for j in range(0, len(x_y_z_1)):
if element[j] == "1":
#print(number1[j], end=" ")
store.append(number1[j])
dic2[number2[i]] = store
else:
#print("site :", number2[i], "residue(%s) in contact :" % chains1name, "F", end="")
store = []
dic2[number2[i]] = "F"
#print("\n", dic2)
# ___________________________________________________________________________________________
np.save("D:\\pythonProject\\seqdiy\\sitework\\distancedata\\"+id+"12.npy",resultlist12)
np.save("D:\\pythonProject\\seqdiy\\sitework\\distancedata\\" + id + "21.npy", resultlist21)
np.save("D:\\pythonProject\\seqdiy\\sitework\\residuecontactdict\\"+id+"12.npy",dic1)
np.save("D:\\pythonProject\\seqdiy\\sitework\\residuecontactdict\\"+id+"12.npy",dic2)
return(dic1,dic2)
parser = PDBParser(PERMISSIVE=True, QUIET=True)
'''id="5H35"
chains1name="FG"
chains2name="E"
path4_1 = "D:\\pythonProject\\seqdiy\\sitework\\pdbsplit\\" + id + "_" + chains1name + ".pdb"
path4_2 = "D:\\pythonProject\\seqdiy\\sitework\\pdbsplit\\" + id + "_" + chains2name + ".pdb"
data2_1 = parser.get_structure(id,path4_1)
data2_2 = parser.get_structure(id,path4_2)
Distance(id,data2_1,data2_2,chains1name,chains2name)'''
import os
import linecache
def combine_fileDEBUG(id):
#### 读取指定路径下的所有文件并放入到列表中
root = "D:\\pythonProject\\seqdiy\\sitework\\ERRORWORK" ###绝对路径
file_names = os.listdir(root)
#print(file_names)
file_ob_list = []
for file_name in file_names:
fileob = root + '/' + file_name
file_ob_list.append(fileob)
#print(file_ob_list)
### 对每个文件,按行读取文件内容并放入同一个列表data中
data = []
for file_ob in file_ob_list:
if id in file_ob:
line_num = 1
length_file = len(open(file_ob, encoding='utf-8').readlines())
#print(length_file)
while line_num <= length_file:
line = linecache.getline(file_ob, line_num)
line = line.strip()
data.append(line)
line_num = line_num + 1
### 将data内容写入到生成的txt文件中,注意编码问题
f = open('D:\\pythonProject\\seqdiy\\sitework\\ERRORWORK\\' + id + ".txt", 'w+', encoding='utf-8')
for i, p in enumerate(data):
#print(i, p)
f.write(p + '\n')
f.close()
其中包含大量的路径,如果需要迁移运用到其他项目中,需要调节很多参数和路径。
可能需要完成相同项目的朋友不是很多,因此仅做记录,不再加以详细解读,如有问题可以私信