文章目录
本文主要介绍 modlamp 的多肽描述符计算部分(详见 modlamp 官方文档中 modlamp.descriptors 一节),并挑选 modlamp.descriptors 中较难理解的部分进行说明
1.GlobalDescriptor
这个类根据序列计算每个多肽的全局描述符
from modlamp.descriptors import *
data=['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']
desc=GlobalDescriptor(data)
desc 对象存储了多肽序列库 (data) 的描述符矩阵,通过调用 desc 对象的方法来得到不同的描述符,存在 desc.descriptor 属性里
1.1.formula
desc.formula(amide=True,append=False) #amide表示C端是否酰胺化
desc.descriptor
"""
array([['C42 H66 N12 O10'],
['C64 H121 N23 O17'],
['C57 H102 N20 O15 S1']], dtype='<U19')
"""
计算分子式,amide 表示多肽序列的 C 端是否酰胺化,append 表示是否将此次得到的描述符添加到已有描述符矩阵中
desc.formula(amide=False,append=False) #可以看到N,H原子数少了,O原子数多了,因为 CONH2 -> COOH
desc.descriptor
"""
array([['C42 H65 N11 O11'],
['C64 H120 N22 O18'],
['C57 H101 N19 O16 S1']], dtype='<U19')
"""
将 amide 设为 False 之后,多肽序列的分子式改变了:C 端从 CONH2 变成了 COOH,净增一个 O,同时减少一个 N 和一个 H,可以从结果中看出
1.2.calculate_charge
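这个描述符计算多肽在给定 pH 下的净电荷。文档中参考的链接打不开,不过从下面的源码可以看出计算方式:对每个可电离基团按 Henderson–Hasselbalch 方程计算带电比例,正电基团(N 端、K、R、H)为 10^(pK-pH)/(10^(pK-pH)+1),负电基团(C 端、D、E、C、Y)为 10^(pH-pK)/(10^(pH-pK)+1),再乘以该基团在序列中的个数,正电荷总和减去负电荷总和即为净电荷;amide=True 时 C 端的 pK 取 15,即酰胺化后的 C 端在常见 pH 下几乎不带负电。下面是源码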
def calculate_charge(self, ph=7.0, amide=False, append=False):
    """Compute the charge of every sequence in ``self.sequences``.

    Each sequence is passed to the helper ``_charge`` together with ``ph``
    and ``amide``; the results are stored as a single-column array.

    :param ph: pH value forwarded to ``_charge``.
    :param amide: forwarded to ``_charge``, where it selects the pK set used
        for the C-terminus.
    :param append: if True, the charge column is stacked horizontally onto
        the existing ``self.descriptor`` and ``'Charge'`` is appended to
        ``self.featurenames``; otherwise both attributes are overwritten.
    :return: None, results are written to ``self.descriptor`` and
        ``self.featurenames``.
    """
    charges = [_charge(seq, ph, amide) for seq in self.sequences]
    # One row per sequence, one column for the charge value.
    column = np.asarray(charges).reshape(len(charges), 1)

    if not append:
        self.descriptor = np.array(column)
        self.featurenames = ['Charge']
        return

    self.descriptor = np.hstack((self.descriptor, np.array(column)))
    self.featurenames.append('Charge')
def _charge(seq, ph=7.0, amide=False):
    """Return the charge of a single sequence at the given pH, rounded to 3 decimals.

    For every ionizable group a partial charge ``c / (c + 1)`` is computed,
    with ``c = 10 ** (pK - ph)`` for positively charged groups and
    ``c = 10 ** (ph - pK)`` for negatively charged groups. Each partial charge
    is weighted by how often the group occurs in the sequence (both termini
    count once), and the negative total is subtracted from the positive total.

    :param seq: sequence handed to ``count_aas`` to obtain per-residue counts.
    :param ph: pH value at which the charge is evaluated.
    :param amide: if True, the C-terminus uses pK 15 instead of 2.15.
    :return: positive minus negative charge, rounded to 3 decimal places.
    """
    # Only the C-terminal pK depends on ``amide``; all other pK values are shared.
    cterm_pk = 15. if amide else 2.15
    pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
    neg_pks = {'Cterm': cterm_pk, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}

    counts = count_aas(seq, scale='absolute')
    # Each terminus is present exactly once per sequence.
    counts['Nterm'] = 1.0
    counts['Cterm'] = 1.0

    def _fraction(exponent):
        # Charged fraction of a group: c / (c + 1) with c = 10 ** exponent.
        ratio = 10 ** exponent
        return ratio / (ratio + 1.0)

    positive = 0.0
    for group, pk in pos_pks.items():
        share = _fraction(pk - ph)
        positive += counts[group] * share

    negative = 0.0
    for group, pk in neg_pks.items():
        share = _fraction(ph - pk)
        negative += counts[group] * share

    return round(positive - negative, 3)
其中 count_aas 效果如下,应该是计数各种氨基酸:
seq='AFDGHLKIA'
count_aas(seq, scale='absolute')
"""
OrderedDict([('A', 2.0),
('C', 0.0),
('D', 1.0),
('E', 0.0),
('F', 1.0),
('G', 1.0),
('H', 1.0),
('I', 1.0),
('K', 1.0),
('L', 1.0),
('M', 0.0),
('N', 0.0),
('P', 0.0),
('Q', 0.0),
('R', 0.0),
('S', 0.0),
('T', 0.0),
('V', 0.0),
('W', 0.0),
('Y', 0.0)])
"""
1.3.isoelectric_point
等电点
desc.isoelectric_point()
desc.descriptor
"""
array([[ 7.703125 ],
[11.49511719],
[10.68164062]])
"""
1.4.instability_index
desc.instability_index()
desc.descriptor
"""
array([[ 2.95 ],
[81.925 ],
[73.24166667]])
"""
1.5.aliphatic_index
脂肪族氨基酸指数
desc.aliphatic_index()
desc.descriptor
"""
array([[110. ],
[ 97.5 ],
[ 73.33333333]])
"""
1.6.boman_index
Boman 指数,是蛋白相互作用的一种度量,计算方法来自论文《Antibacterial and antimalarial properties of peptides that are cecropin-melittin hybrids》
desc.boman_index()
desc.descriptor
"""
array([[0.42 ],
[4.33 ],
[2.48083333]])
"""
1.7.hydrophobic_ratio
疏水性氨基酸(A、C、F、I、L、M、V)在序列中所占的比例,例如第一条序列 AFDGHLKI 中有 A、F、L、I 共 4 个,4/8 = 0.5
desc.hydrophobic_ratio()
desc.descriptor
"""
array([[0.5 ],
[0.25 ],
[0.33333333]])
"""
1.8.calculate_all
一次性计算 GlobalDescriptor 已实现的全部描述符,各列的含义见 desc.featurenames
desc.calculate_all()
desc.descriptor,desc.featurenames
"""
(array([[8.00000000e+00, 8.99060000e+02, 1.03100000e+00, 1.14675328e-03,
1.00253906e+01, 2.95000000e+00, 1.25000000e-01, 1.10000000e+02,
4.20000000e-01, 5.00000000e-01],
[1.20000000e+01, 1.48480000e+03, 4.98800000e+00, 3.35937500e-03,
1.21787109e+01, 8.19250000e+01, 0.00000000e+00, 9.75000000e+01,
4.33000000e+00, 2.50000000e-01],
[1.20000000e+01, 1.33962000e+03, 3.83500000e+00, 2.86275212e-03,
1.15722656e+01, 7.32416667e+01, 0.00000000e+00, 7.33333333e+01,
2.48083333e+00, 3.33333333e-01]]),
['Length',
'MW',
'Charge',
'ChargeDensity',
'pI',
'InstabilityInd',
'Aromaticity',
'AliphaticInd',
'BomanInd',
'HydrophRatio'])
"""
2.PeptideDescriptor
from modlamp.descriptors import *
data=['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']
desc=PeptideDescriptor(data,scalename='pepcats')
desc.descriptor,desc.sequences
"""
(array([], shape=(1, 0), dtype=float64),
['AFDGHLKI', 'KKLQRSDLLRTK', 'KKLASCNNIPPR'])
"""
scalename 用于选择不同的 amino acid descriptor scales(氨基酸描述符标度),可选值及含义详见 modlamp 官方文档中关于 scalename 的说明
2.1.calculate_crosscorr
desc.calculate_crosscorr(window=2,append=False),desc.descriptor,desc.descriptor.shape
"""
(None, array([[0.75 , 0.57142857, 0.25 , 0.14285714, 0. ,
0.14285714, 0.25 , 0.14285714, 0.25 , 0.14285714,
0. , 0.14285714, 0.25 , 0. , 0. ,
0.14285714, 0.125 , 0. , 0.125 , 0. ,
0. , 0.14285714, 0.125 , 0. , 0. ,
0. , 0. , 0. , 0.125 , 0. ,
0.25 , 0. , 0.25 , 0. , 0. ,
0. , 0.25 , 0. , 0. , 0. ,
0.125 , 0. ],
[0.66666667, 0.36363636, 0. , 0. , 0. ,
0.27272727, 0.41666667, 0.45454545, 0.41666667, 0.18181818,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.33333333, 0.09090909, 0.25 ,
0.18181818, 0. , 0.18181818, 0.08333333, 0.09090909,
0.66666667, 0.45454545, 0.41666667, 0.27272727, 0. ,
0.09090909, 0.41666667, 0.09090909, 0. , 0. ,
0.08333333, 0. ],
[0.75 , 0.54545455, 0. , 0. , 0.08333333,
0.18181818, 0.33333333, 0.36363636, 0.25 , 0.18181818,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.33333333, 0.27272727, 0.33333333,
0.27272727, 0. , 0. , 0. , 0. ,
0.58333333, 0.36363636, 0.25 , 0.09090909, 0. ,
0. , 0.25 , 0.09090909, 0. , 0. ,
0. , 0. ]]), (3, 42))
"""
2.2.calculate_autocorr
desc.calculate_autocorr(window=2,append=False),desc.descriptor,desc.descriptor.shape
"""
(None, array([[0.75 , 0.25 , 0.125 , 0.25 , 0.25 ,
0.125 , 0.57142857, 0. , 0. , 0. ,
0. , 0. ],
[0.66666667, 0. , 0.33333333, 0.66666667, 0.41666667,
0.08333333, 0.36363636, 0. , 0.09090909, 0.45454545,
0.09090909, 0. ],
[0.75 , 0. , 0.33333333, 0.58333333, 0.25 ,
0. , 0.54545455, 0. , 0.27272727, 0.36363636,
0.09090909, 0. ]]), (3, 12))
"""
2.3.calculate_moment
data=['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']
desc=PeptideDescriptor(data,'eisenberg')
desc.descriptor,desc.sequences
desc.calculate_moment()
desc.descriptor
"""
array([[0.4205886 ],
[0.20845564],
[0.41395658]])
"""
计算这个描述符的时候选择某些 amino acid descriptor scales 会报错
2.4.calculate_global
data=['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']
desc=PeptideDescriptor(data,'pepcats')
desc.descriptor,desc.sequences
desc.calculate_global()
desc.descriptor
"""
array([[1.75 ],
[2.16666667],
[1.91666667]])
"""