获取mmCIF对象的原子坐标

qq_27390023
于 2023-11-30 14:50:08 发布
阅读量100
点赞数
文章标签：生物信息学
本文链接：https://blog.csdn.net/qq_27390023/article/details/134712227
版权
mmCIF 格式字符串解析后得到ParsingResult（含有mmCIF对象（class MmcifObject）mmCIF 格式字符串解析，从mmCIF提取指定链上氨基酸的坐标，得到链上所有原子坐标和坐标位置掩码向量：all_positions 和 all_positions_mask。
import dataclasses
import numpy as np
from Bio import PDB
from typing import Mapping, Any, Optional, Tuple
import pickle

# PDB.Structure.Structure:biopython中处理PDB文件的一种数据结构
PdbStructure = PDB.Structure.Structure
PdbHeader = Mapping[str, Any]
ChainId = str
SeqRes = str


# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
  chain_id: str
  residue_number: int
  insertion_code: str
    
    
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
  position: Optional[ResiduePosition]
  name: str
  is_missing: bool
  hetflag: str


@dataclasses.dataclass(frozen=True)
class MmcifObject:
  """Representation of a parsed mmCIF file.

  Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
      files being processed.
    header: Biopython header.
    structure: Biopython structure.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
      {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
      SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                        1: ResidueAtPosition,
                                                        ...}}
    raw_string: The raw string used to construct the MmcifObject.
  """
  file_id: str
  header: PdbHeader
  structure: PdbStructure
  chain_to_seqres: Mapping[ChainId, SeqRes]
  seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
  raw_string: Any


# This mapping is used when we need to store atom data in a format that requires
# fixed atom data size for every residue (e.g. a numpy array).
atom_types = [
    'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
    'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
    'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
    'CZ3', 'NZ', 'OXT'
]
# OXT代表肽链的羟基终端。
atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
atom_type_num = len(atom_types)  # := 37.


class Error(Exception):
  """Base class for exceptions."""


class CaDistanceError(Error):
  """An error indicating that a CA atom distance exceeds a threshold."""


def _check_residue_distances(all_positions: np.ndarray,
                             all_positions_mask: np.ndarray,
                             max_ca_ca_distance: float):
  """Checks if the distance between unmasked neighbor residues is ok."""
  ca_position = atom_order['CA']
  prev_is_unmasked = False
  prev_calpha = None
  for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
    this_is_unmasked = bool(mask[ca_position])
    if this_is_unmasked:
      this_calpha = coords[ca_position]
      if prev_is_unmasked:
        distance = np.linalg.norm(this_calpha - prev_calpha)
        if distance > max_ca_ca_distance:
          raise CaDistanceError(
              'The distance between residues %d and %d is %f > limit %f.' % (
                  i, i + 1, distance, max_ca_ca_distance))
      prev_calpha = this_calpha
    prev_is_unmasked = this_is_unmasked


def get_atom_positions(
    mmcif_object: MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]:
  """Gets atom positions and mask from a list of Biopython Residues."""
  num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])
 
  # mmcif_object.structure 为PDB.Structure.Structure 对象
  relevant_chains = [c for c in mmcif_object.structure.get_chains()
                     if c.id == auth_chain_id]

  #print(f"type(relevant_chains): {type(relevant_chains)}")
  #print(f"relevant_chains: {relevant_chains}")
    
    
  if len(relevant_chains) != 1:
    raise MultipleChainsError(
        f'Expected exactly one chain in structure with id {auth_chain_id}.')
  # chain属于 Bio.PDB.Chain.Chain 类
  chain = relevant_chains[0]


  #print(f"class(chain): {type(chain)}")
  #print(f"chain: {chain}")
    
  # 先生成全0的向量：
  # all_positions 第一维为氨基酸位置，第二维为原子位置，第三维为原子坐标
  all_positions = np.zeros([num_res, atom_type_num, 3])
  all_positions_mask = np.zeros([num_res, atom_type_num],
                                 dtype=np.int64)
  
  # 遍历序列中所有的氨基酸
  for res_index in range(num_res):
    pos = np.zeros([atom_type_num, 3], dtype=np.float32)
    mask = np.zeros([atom_type_num], dtype=np.float32)
    
    # res_at_position属于 ResidueAtPosition 类
    res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
       
    #print(f"type(res_at_position):{type(res_at_position)}")
    #print(f"res_at_position:{res_at_position}")
    
    if not res_at_position.is_missing:
      # Biopython 中的 chain 对象中通过 (hetflag, residue_number, insertion_code) 组合来获取特定位置的氨基酸。
      # 如 chain[(' ', 3, ' ')])， res属于 Bio.PDB.Residue.Residue 类
      res = chain[(res_at_position.hetflag,
                   res_at_position.position.residue_number,
                   res_at_position.position.insertion_code)]
     
      #print(f"type(res):{type(res)}")
      #print(f"res:{res}")
      
      # 遍历这个氨基酸的所有原子
      for atom in res.get_atoms():
        atom_name = atom.get_name()
        #if atom_name == 'OXT':
        #   print("got oxt")
        
        #print(atom_name)
        # 得到原子的坐标    
        x, y, z = atom.get_coord()
        if atom_name in atom_order.keys():
          # 向量中该原子位置处赋值
          pos[atom_order[atom_name]] = [x, y, z]
          mask[atom_order[atom_name]] = 1.0
        # MSE，硒代甲硫氨酸（甲硫氨酸中的S被Se代替），SE表示硒元素
        elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE':
          # Put the coordinates of the selenium atom in the sulphur column.
          pos[atom_order['SD']] = [x, y, z]  # delta位S元素
          mask[atom_order['SD']] = 1.0
    
      #print("======")
        
      # Fix naming errors in arginine residues where NH2 is incorrectly
      # assigned to be closer to CD than NH1.
      cd = atom_order['CD']
      nh1 = atom_order['NH1']
      nh2 = atom_order['NH2']
    
      #print(f"nh2,{nh2}")
      
      # 氨基酸为ARG，结构中'CD'，'NH1'，'NH2'原子坐标都存在以及'NH1'与'CD'的距离大于'NH2'与'CD'的距离
      # 可能是pdb数据库中'NH1'与'NH2'原子归属错了！
      if (res.get_resname() == 'ARG' and
          all(mask[atom_index] for atom_index in (cd, nh1, nh2)) and
          (np.linalg.norm(pos[nh1] - pos[cd]) >
           np.linalg.norm(pos[nh2] - pos[cd]))):
            
        #print(f"mask[atom_index],{mask[atom_index]}")
        
        # 'NH1'与'NH2'原子坐标及对应的mask位置互换
        pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy()
        mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy()

    all_positions[res_index] = pos
    all_positions_mask[res_index] = mask
  _check_residue_distances(
      all_positions, all_positions_mask, max_ca_ca_distance)
  return all_positions, all_positions_mask


#### mmCIF格式字符串解析后结果保存
## 从 https://www.rcsb.org 下载好8jlp.cif 文件
#with open("8jlp.cif") as f:
#    mmcif_str = f.read()
#result = parse(file_id = '8jlp.cif', mmcif_string = mmcif_str)

#with open('test_mmcif_object.pkl', 'wb') as file:
#    # Dump the data into the file
#    pickle.dump(result.mmcif_object, file)


### 读入mmcif_object数据
# 打开二进制文件以进行读取
with open('test_mmcif_object.pkl', 'rb') as file:
    # 使用 pickle.load 从文件中加载对象
    test_mmcif_object = pickle.load(file)

#print(test_mmcif_object)

# print(result.mmcif_object)
chain_id = 'B' # 选择相应的链
all_positions, all_positions_mask = get_atom_positions(mmcif_object=test_mmcif_object, 
                                                       auth_chain_id=chain_id, 
                                                       max_ca_ca_distance=150.0)
print(all_positions.shape)
print(all_positions)