1. 下载、可视化结构数据
from Bio.PDB import *
# Hemoglobin structure
rcsb_id = "1NS6"

# Fetch the entry in PDB format into the current directory.
pdbl = PDBList()
pdbl.retrieve_pdb_file(rcsb_id, pdir=".", file_format="pdb")

# Name of the file the download is expected to have produced
# ("pdb" + lower-case id + ".ent").
pdb_file = f"pdb{rcsb_id.lower()}.ent"

# Parse the downloaded file into a Structure object and show it with its header.
parser = PDBParser(PERMISSIVE=True, QUIET=True)
structure = parser.get_structure(rcsb_id, pdb_file)
print(structure)
print(structure.header)

import nglview

# Build an interactive view of the same file.
view = nglview.show_file(pdb_file)
# Bare expression: presumably the last line of a notebook cell, so that the
# widget is rendered as the cell output.
view
2. 提取序列
## Extract the polypeptide sequences
# Original author's note: two A chains and two B chains form a tetramer
# (not verified by this code).
from Bio.PDB.Polypeptide import PPBuilder

ppb = PPBuilder()
for pp in ppb.build_peptides(structure):
    sequence = pp.get_sequence()
    print(len(sequence))
    print(sequence)

## Extract per-chain content: amino-acid residues plus waters and other groups
# Structure.get_models() returns an iterator over the models.
model = structure.get_models()
models = [*model]
print(f"模型的数量:{len(models)}")

# Model.get_chains() returns an iterator over the chains.
chain_lst = [*models[0].get_chains()]
print(f"链的数量:{len(chain_lst)}")

for chain in chain_lst:
    print(chain)
    # Chain.get_residues() returns an iterator over the residues.
    residue_lst = [*chain.get_residues()]
    print(f"残基长度(包括水分子等):{len(residue_lst)}")
    print(residue_lst)
3. 提取小分子配体
def get_ligand(structure):
    """Return the residue ids of the non-water hetero residues in the first model.

    Only the first model yielded by ``structure.get_models()`` is inspected.
    A residue id is kept when its first field is neither ``' '`` nor ``'W'``;
    in Bio.PDB that field is the hetero flag, e.g. ``('H_HEM', 142, ' ')``.

    Args:
        structure: object exposing ``get_models()``; each model must expose
            ``get_chains()``, each chain ``get_residues()`` and each residue
            ``get_id()`` (a Bio.PDB ``Structure`` satisfies this).

    Returns:
        list: residue ids in chain order, then residue order. The ids carry no
        chain information, so identical ids from different chains may repeat.
        An empty list is returned when the structure has no models.
    """
    # Take the first model lazily instead of materialising every model.
    first_model = next(iter(structure.get_models()), None)
    if first_model is None:
        # Nothing to scan; avoids an IndexError on an empty structure.
        return []

    ligand_lst = []
    for chain in first_model.get_chains():
        for residue in chain.get_residues():
            residue_id = residue.get_id()
            # Skip standard residues (' ') and water ('W').
            if residue_id[0] not in (' ', 'W'):
                ligand_lst.append(residue_id)
    return ligand_lst
# Collect the non-water hetero residue ids of the parsed structure and print them.
ligands = get_ligand(structure)
print(ligands)
4. 计算原子距离
# Pick two residues of the first chain by their integer ids (the original
# note calls them the first and second amino acid of that chain).
first_chain = chain_lst[0]
residue1 = first_chain[1]
residue2 = first_chain[2]
for picked_residue in (residue1, residue2):
    print(picked_residue)

# Look up the CA atom of each residue and show the atoms, then their coordinates.
ca1, ca2 = residue1["CA"], residue2["CA"]
for picked_ca in (ca1, ca2):
    print(picked_ca)
for picked_ca in (ca1, ca2):
    print(picked_ca.get_coord())

# The "-" operator is overloaded for atoms; the result is used as their distance.
distance = ca1 - ca2
print(distance)
5. 筛选蛋白质中和配体原子距离接近的原子
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.Selection import unfold_entities

# Atoms of every residue named HEM in the first chain; such a residue has an
# id like ('H_HEM', 142, ' ').
heme_atom_lst = []
protein_atom_lst = []  # never filled in this section; kept so the name stays defined
for residue in chain_lst[0].get_residues():
    if residue.get_resname() == 'HEM':
        print(residue)
        heme_atom_lst.extend(residue.get_atoms())

close_atom_lst = []
limit = 5.0

# The "is this a heme atom?" test does not depend on which heme atom is being
# compared, so the (residue, atom) candidates are built once instead of being
# re-filtered for every heme atom. Structure order is preserved.
# NOTE(review): the imported NeighborSearch could replace this brute-force
# scan for large structures, at the cost of a different result order.
candidate_lst = [
    (candidate_residue, candidate_atom)
    for candidate_residue in structure.get_residues()
    for candidate_atom in candidate_residue.get_atoms()
    if candidate_atom not in heme_atom_lst
]

for heme_atom in heme_atom_lst:
    for residue, atom in candidate_lst:
        # "atom - heme_atom" is the inter-atomic distance; it cannot be
        # negative, so only the upper bound needs checking.
        if atom - heme_atom <= limit:
            print(heme_atom)
            print(residue)
            print(atom)
            close_atom_lst.append((heme_atom, residue, atom))
print(close_atom_lst)
6. 筛选柔性区域
B-factor(又称 Debye-Waller factor 或 temperature factor,即温度因子)是一个参数,用来描述 X 射线衍射测定蛋白质晶体结构时,由原子热运动造成的射线衰减或散射现象。B-factor 的数值(B 值)可用于识别蛋白质结构中原子、氨基酸侧链及 loop 区域的运动性和柔性,因而广泛应用于蛋白质动力学研究、生物活性小分子筛选以及蛋白质工程等领域,具有极为重要的科学研究意义。
# Print the coordinates of every CA atom whose B factor is greater than 15.
# Residues without a CA atom are skipped.
for model in structure:
    for chain in model:
        for residue in chain:
            if not residue.has_id("CA"):
                continue
            ca = residue["CA"]
            if ca.get_bfactor() > 15.0:
                print(ca.get_coord())
7. 计算角度和二面角
import math

import Bio.PDB.vectors as vec

# Backbone atoms of residue1 as vectors.
n = residue1["N"].get_vector()
ca = residue1["CA"].get_vector()
c = residue1["C"].get_vector()
o = residue1["O"].get_vector()

# N-CA-C angle: printed as returned (radians), then converted to degrees.
# math.degrees uses the exact value of pi rather than the 3.14 approximation,
# and the angle is computed only once.
angle = vec.calc_angle(n, ca, c)
print(angle)
print(math.degrees(angle))

# N-CA-C-O dihedral angle, printed as returned by calc_dihedral.
print(vec.calc_dihedral(n, ca, c, o))
参考:
https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ