分子描述符计算
通过SMILES数据文件(csv格式)和.sdf文件计算分子描述符,并通过pandas将分子描述符、SMILES表达式、分子物化性质合并成一个csv文件。
1.通过含SMILES表达式的csv文件计算描述符生成csv文件
数据文件是一个有4333个分子的熔点数据的csv文件。
| | SMILES | melting_point |
|---|---|---|
| sample_1 | O=C1Cc2ccccc21 | 14.0 |
| sample_2 | Clc1ccc(cc1)C1c2c(OC(N)=C1C#N)[nH][nH0]c2C(F)(F)F | 20.5 |
| … | … | … |
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
# Script 1: compute RDKit descriptors for every SMILES string in a CSV file
# and write them, together with the target property, to a new CSV file.

# The first CSV column holds the sample labels and becomes the index.
dataset = pd.read_csv('molecules_with_melting_point.csv', index_col=0)
smiles = dataset.iloc[:, 0]  # SMILES string of each molecule
y = dataset.iloc[:, 1]  # target property: melting_point

# Names of the descriptors to calculate: the first element of every entry in
# Descriptors.descList.
descriptor_names = [
    descriptor_information[0]
    for descriptor_information in Descriptors.descList
]
print("The num of descriptors:", len(descriptor_names))

# Calculate the descriptors molecule by molecule.
descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
print("num of molcular:%d" % len(smiles))

# Row used for molecules that cannot be built, so that the descriptor table
# keeps exactly one row per entry of dataset.index.
missing_row = (float('nan'),) * len(descriptor_names)

descriptors = []
for sample_name, smiles_i in zip(dataset.index, smiles):
    # A non-string cell (e.g. an empty CSV field read as NaN) cannot be parsed
    # as SMILES at all; treat it like an unparsable string.
    molecule = Chem.MolFromSmiles(smiles_i) if isinstance(smiles_i, str) else None
    if molecule is None:
        # MolFromSmiles returns None instead of raising when parsing fails.
        # Passing None on to the calculator would not give meaningful
        # descriptor values, so record an explicit NaN row instead.
        print("Could not parse SMILES for %s: %r" % (sample_name, smiles_i))
        descriptors.append(missing_row)
        continue
    descriptors.append(descriptor_calculator.CalcDescriptors(molecule))

descriptors = pd.DataFrame(descriptors, index=dataset.index, columns=descriptor_names)

# Put the target property in front of the descriptor columns and save.
descriptors_with_y = pd.concat([y, descriptors], axis=1)
descriptors_with_y.to_csv('descriptors_with_y.csv')
2.通过.sdf文件计算描述符生成csv文件
.sdf文件格式:
第一行:一般作为分子名字,如 Levetiracetam
第二行:注释,ChemDraw06111413562D
第三行:一般是空行
第四行:计数行,给出原子个数、键的个数等;其后依次是原子块和键块,以 M END 结束
> <属性1>(属性名行)
属性1的值
每个分子记录以 $$$$ 结束,如:
RDKit 2D
9 10 0 0 0 0 0 0 0 0999 V2000
-3.1404 0.7862 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
-1.8315 0.0534 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.4242 -1.3902 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.0195 -0.9828 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.4734 -1.3519 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
2.5199 -0.2773 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
2.1125 1.1664 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.6586 1.5354 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.3879 0.4608 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0
2 3 1 0
3 4 1 0
4 5 2 0
5 6 1 0
6 7 2 0
7 8 1 0
8 9 2 0
9 2 1 0
9 4 1 0
M END
> <melting_point> (1)
14.0
$$$$
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
# Script 2: compute RDKit descriptors for every molecule in an SD file and
# write them, together with the target property, to a CSV file indexed by the
# SMILES string of each molecule.

y_name = 'melting_point'  # SD property that stores the target value
sdf = Chem.SDMolSupplier('melting_point.sdf')

# Names of the descriptors to calculate: the first element of every entry in
# Descriptors.descList.
descriptor_names = [
    descriptor_information[0]
    for descriptor_information in Descriptors.descList
]
print("The num of descriptors:", len(descriptor_names))
descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

descriptors, y, smiles = [], [], []
print("num of molecualr:%d" % len(sdf))
for index, molecule in enumerate(sdf):
    if molecule is None:
        # The supplier yields None for a record it cannot turn into a
        # molecule; skip it instead of failing on molecule.GetProp below.
        print("Skipping record %d: molecule could not be parsed" % (index + 1))
        continue
    # The property is read as text from the SD record, hence the conversion.
    y.append(float(molecule.GetProp(y_name)))
    descriptors.append(descriptor_calculator.CalcDescriptors(molecule))
    smiles.append(Chem.MolToSmiles(molecule))

# Both tables share the SMILES strings as their index.
descriptors = pd.DataFrame(descriptors, index=smiles, columns=descriptor_names)
y = pd.DataFrame(y, index=smiles, columns=[y_name])

# Put the target property in front of the descriptor columns and save.
descriptors_with_y = pd.concat([y, descriptors], axis=1)
descriptors_with_y.to_csv('descriptors_with_y.csv')