# Preliminary cleaning pipeline for molecular data
import pandas as pd
import rdkit
from rdkit import Chem
from standardiser import standardise
import logging
from rdkit.Chem import Descriptors
# Element symbols treated as metals by this script. The order is kept as
# originally written; the symbols are listed as whitespace-separated text
# and split into a list of strings.
METAL_ELEMENTS = (
    'Li Be Na Mg Al K Ca Sc Ti V Cr Mn Fe Co '
    'Ni Cu Zn Ga Ge Rb Sr Y Zr Nb Mo Tc Ru Rh '
    'Pd Ag Cd In Sn Sb Cs Ba Lu Hf Ta W Re Os '
    'Ir Pt Au Hg Tl Pb Bi Po Fr Ra Lr Ho'
).split()
# Load the raw activity table and discard rows that have no SMILES string.
df = pd.read_csv('tubulin.csv')
df.dropna(axis=0, subset=["Smiles"], inplace=True)

# Remove metal-containing molecules, printing the remaining shape after each
# element. In SMILES a metal can only be written as a bracket atom, so the
# symbol is matched immediately after "[" (plus an optional isotope number)
# and must not be followed by a lowercase letter. A plain substring search
# would also discard purely organic molecules: "Sc" occurs in the aryl
# thioether "CSc1ccccc1", and "K"/"Y" would match "[Kr]"/"[Yb]".
for metal in METAL_ELEMENTS:
    bracket_atom = r'\[\d*' + metal + r'(?![a-z])'
    df = df[~df['Smiles'].str.contains(bracket_atom, regex=True)]
    print("no {}".format(metal), df.shape)
# Standardise every structure and write the resulting SMILES back into the
# 'Smiles' column. Rows whose standardisation raises StandardiseException are
# logged and keep their original SMILES.
unparseable = []
for i in df.index:
    smi = df.loc[i, 'Smiles']
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit returns None (it does not raise) for SMILES it cannot parse;
        # passing None on to AddHs would abort the whole script with an error
        # that the except clause below does not catch. Such rows cannot be
        # used by the descriptor calculations later on, so they are removed.
        logging.warning("Could not parse SMILES %r at index %s; dropping row", smi, i)
        unparseable.append(i)
        continue
    try:
        mol = Chem.AddHs(mol)
        parent = standardise.run(mol)
        mol_ok_smi = Chem.MolToSmiles(parent)
        df.loc[i, 'Smiles'] = mol_ok_smi
    except standardise.StandardiseException as e:
        logging.warning(e.message)

# Drop after the loop so the index is not modified while it is iterated.
df.drop(unparseable, axis=0, inplace=True)
# Collapse rows that are identical in every column, keeping the first one.
df.drop_duplicates(keep='first', inplace=True)
print('删除重复值df:', df.shape)

# Keep only IC50, Ki and Kd measurements. The three subsets are concatenated
# in that order, so the resulting frame is grouped by measurement type.
df1, df3, df4 = (
    df[df['Standard Type'].isin([kind])] for kind in ("IC50", "Ki", "Kd")
)
frames = [df1, df3, df4]
df = pd.concat(frames)

# Keep rows whose relation is exactly "'='" and then rows whose assay type is
# "B", printing the frame shape after each filter.
for column, wanted, banner in (
    ('Standard Relation', "'='", '删除非=的df:'),
    ('Assay Type', "B", '删除非B的df:'),
):
    df = df[df[column].isin([wanted])]
    print(banner, df.shape)
# Molecular weight of every structure, stored in 'molecular_weight'; rows
# above 1000 are discarded.
molweight = [
    Descriptors.MolWt(Chem.MolFromSmiles(smi)) for smi in list(df['Smiles'])
]
df['molecular_weight'] = molweight
df = df[df['molecular_weight'] <= 1000]

# logP of the remaining structures, stored in 'logP'.
logP = [
    Descriptors.MolLogP(Chem.MolFromSmiles(smi)) for smi in list(df['Smiles'])
]
df['logP'] = logP
# Split the measurements by unit. Only nM and ug.mL-1 rows are carried
# forward, because the latter can be converted to nM using the molecular
# weight.
df2 = df[df['Standard Units'].isin(["nM"])]
print('单位为nM的df:', df2.shape)
# .copy() so the conversion below modifies an independent frame rather than a
# slice of df.
df3 = df[df['Standard Units'].isin(["ug.mL-1"])].copy()
print('单位为ug/ml的df:', df3.shape)

# ug/mL -> nM: 1 ug/mL = 1e-3 g/L; dividing by MW (g/mol) gives mol/L, and
# multiplying by 1e9 gives nmol/L, i.e. value / MW * 1e6.
df3['Standard Value'] = df3['Standard Value'] / df3['molecular_weight'] * 1000000
df3['Standard Units'] = "nM"

# Recombine so the following steps operate on values that are all in nM.
# Previously the converted frame was never merged back, so the per-compound
# mean was computed over mixed units.
# NOTE(review): rows reported in any other unit are excluded here — confirm
# that this is the intended handling.
df = pd.concat([df2, df3])
# Mean 'Standard Value' per 'Molecule ChEMBL ID', kept both as a Series and
# as a plain dict for lookup.
df_mean = df.groupby('Molecule ChEMBL ID')['Standard Value'].mean()
print('df', df_mean.shape)
df_mean_dict = df_mean.to_dict()

# Attach the per-compound mean to every row; an ID that is missing from the
# dict raises KeyError.
df['standard_value_mean'] = df['Molecule ChEMBL ID'].apply(
    df_mean_dict.__getitem__
)

# The individual values are no longer needed: drop that column, keep one row
# per compound and move the old index into a regular column.
df.drop(labels=['Standard Value'], axis='columns', inplace=True)
df.drop_duplicates(subset=['Molecule ChEMBL ID'], inplace=True)
df.reset_index(inplace=True)
print('取平均值后的df:', df.shape)
# Turn the mean value into a binary label: values at or below 10000 become 1,
# values above 10000 become 0 (presumably nM — confirm against the unit
# handling earlier in the file). Both masks are taken before any assignment;
# rows that match neither mask (e.g. NaN) are left unchanged.
active = df['standard_value_mean'] <= 10000
inactive = df['standard_value_mean'] > 10000
df.loc[active, 'standard_value_mean'] = 1
df.loc[inactive, 'standard_value_mean'] = 0
print('转换平均值后的df:', df.shape)

# Keep only the structure and its label, rename the two columns and write
# the result to data.csv without the index.
df = pd.DataFrame(df, columns=['Smiles', 'standard_value_mean']).rename(
    columns={'Smiles': 'SMILES', 'standard_value_mean': 'LABEL'}
)
df.to_csv('data.csv', index=None)
from rdkit.Chem import Descriptors
import numpy as np
def get_rdkit_des(mol):
    """Return every descriptor in ``Descriptors.descList`` for *mol*.

    Each descriptor function is called with the molecule, in the order given
    by ``Descriptors.descList``. The 'Ipc' descriptor is called with
    ``avg=True``; all others are called with the molecule only.

    Args:
        mol: The molecule object passed to each descriptor function.

    Returns:
        A NumPy array holding one value per descriptor.
    """
    return np.asarray([
        compute(mol, avg=True) if name == 'Ipc' else compute(mol)
        for name, compute in Descriptors.descList
    ])
# Feature matrix: one row of RDKit descriptors per entry of the first column
# of df, each entry being parsed as a SMILES string.
X = np.array(
    list(map(get_rdkit_des, map(Chem.MolFromSmiles, list(df.iloc[:, 0]))))
)