用 rdkit 的 doRandom 功能进行批量数据扩增
1. 输入数据
from rdkit import Chem
import pandas as pd
import numpy as np
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
# Load the raw SMILES table; the CSV is decoded as GBK.
data = pd.read_csv(r"C:\Users\29351\Desktop\rnn\all-database-50-100-standard-rnn.csv", encoding='gbk')

# Convert every entry of the 'SMILES' column to RDKit's canonical SMILES.
# Rows that RDKit cannot process keep their original value; their position
# and original string are recorded in error_number / error_smiles so they
# can be inspected afterwards.
error_number = []
error_smiles = []
canonical_smiles = []
for i, ori_smiles in enumerate(data['SMILES']):
    try:
        # MolFromSmiles returns None (it does not raise) for a string it
        # cannot parse, so the failure has to be checked explicitly.
        mol = Chem.MolFromSmiles(ori_smiles)
        standard_smiles = (
            Chem.MolToSmiles(mol, isomericSmiles=True) if mol is not None else None
        )
    except Exception:
        # Non-string cells (e.g. NaN) make RDKit raise an argument error;
        # treat them as failed rows instead of aborting the whole run.
        standard_smiles = None

    if standard_smiles is None:
        error_number.append(i)
        error_smiles.append(ori_smiles)
        canonical_smiles.append(ori_smiles)
    else:
        canonical_smiles.append(standard_smiles)

# Assign the whole column at once: element-wise chained assignment
# (data['SMILES'][i] = ...) is unreliable and is a no-op under pandas
# copy-on-write.
data['SMILES'] = canonical_smiles
2. 扩增数据
# Augment each molecule with 10 randomized SMILES strings generated by
# RDKit's doRandom option.
#   smiles           - flat list of every random SMILES generated, all rows
#   data['smi_10']   - per row: the row's SMILES followed by its 10 random
#                      variants, comma-separated
# Rows whose SMILES cannot be parsed are skipped: their 'smi_10' value is
# just the unmodified 'SMILES' value and nothing is added to `smiles`.
smiles = []
smi_10 = []
for smiles_one in data['SMILES']:
    # Guard against non-string cells (e.g. NaN), which RDKit rejects with
    # an exception rather than by returning None.
    mol = (
        Chem.MolFromSmiles(smiles_one, sanitize=True)
        if isinstance(smiles_one, str)
        else None
    )
    if mol is None:
        smi_10.append(smiles_one)
        continue

    random_smiles = [Chem.MolToSmiles(mol, doRandom=True) for _ in range(10)]
    smiles.extend(random_smiles)
    smi_10.append(','.join([smiles_one, *random_smiles]))

# Assign the whole column at once instead of element-wise chained
# assignment (data['smi_10'][i] = ...), which is unreliable in pandas.
data['smi_10'] = smi_10
3. 输出 data