说明：parse_pdb_split_chain() 函数用于将 PDB 文件中包含的各条链拆分开，并保留每条链对应的 alpha-helix、beta-sheet、links 信息（不足之处在于：由于暂时没有遇到含有 TURN 记录的 PDB 结构，所以暂时没有获取对应的 turn 信息）。
import gzip
import re
def parse_pdb_split_chain(pdbgzFile,outpath):
    """Split a gzip-compressed PDB file into one PDB file per chain.

    For every chain that owns at least one ATOM record, a file named
    ``<PDBID>_<chain>.pdb`` is written into ``outpath``. Each file holds, in
    this order: the HELIX, SHEET, LINK and ATOM records of that chain,
    followed by a ``TER`` line and an ``END`` line. TURN records are not
    extracted.

    Records are assigned to a chain by the fixed column in which the PDB
    format stores the chain identifier (initial chain for HELIX/SHEET, first
    partner for LINK). Whitespace-tolerant regular expressions can backtrack
    and attribute a record to the wrong chain (e.g. an ``OXT GLY A`` atom
    being picked up for chain ``G``), so columns are used instead.

    Args:
        pdbgzFile: Path to the gzip-compressed PDB file. The PDB ID is taken
            from the base name: the text before the first ``.``, minus its
            first three characters, upper-cased (presumably stripping the
            ``pdb`` prefix of names such as ``pdb1abc.ent.gz`` -- confirm
            against the callers' naming scheme).
        outpath: Existing directory that receives the per-chain files.

    Returns:
        None. The result is the set of files written to ``outpath``.
    """
    import os  # local import: only this function needs it

    # Record name -> 0-based column that holds the chain identifier.
    chain_columns = {'HELIX': 19, 'SHEET': 21, 'LINK': 21, 'ATOM': 21}
    # Order in which the record types are emitted into each output file.
    write_order = ('HELIX', 'SHEET', 'LINK', 'ATOM')

    # chain -> record name -> lines, filled in a single pass over the file.
    # Dict insertion order keeps the chains in first-seen order.
    records = {}
    with gzip.open(pdbgzFile, 'rt', encoding='utf-8') as pdbF:
        for raw_line in pdbF:
            line = raw_line.rstrip('\n')
            record = line[:6].rstrip()
            column = chain_columns.get(record)
            if column is None or len(line) <= column:
                continue
            chain = line[column]
            if not chain.strip():
                continue  # a blank chain ID cannot name an output file
            records.setdefault(chain, {}).setdefault(record, []).append(line)

    pdb_id = os.path.basename(pdbgzFile).split('.')[0][3:].upper()
    for chain, by_record in records.items():
        # A chain is only written when it has coordinates of its own.
        if 'ATOM' not in by_record:
            continue
        outfile = os.path.join(outpath, pdb_id + '_' + chain + '.pdb')
        with open(outfile, 'w') as outF:
            for record in write_order:
                for line in by_record.get(record, ()):
                    outF.write(line + '\n')
            outF.write('TER\n')
            outF.write('END\n')
pdbgzFile：指的是 PDB 的 gzip 压缩文件。
outpath：指的是结果的输出目录。
输出文件的命名方式是:PDBID_chainID.pdb