一、 BRICS切分
主体函数
from rdkit import Chem
from rdkit.Chem import BRICS
import numpy as np
# --------------------- Split a molecule into fragments whose dummy atoms carry label 0 ---------------------
def fragment_recursive(mol, frags):
    """Cut `mol` at its BRICS bonds, one bond per pass, collecting fragment SMILES.

    On every pass the BRICS bond with the lowest bond index is broken. The
    first of the two resulting pieces is appended to `frags` as SMILES and
    the second piece is processed again. When no BRICS bond is left, the
    SMILES of the remaining piece is appended and `frags` is returned.

    Args:
        mol: RDKit molecule to fragment.
        frags: List that receives the fragment SMILES; it is mutated in place.

    Returns:
        `frags` on success. If any step raises, the exception is printed and
        None is returned; SMILES appended before the failure stay in `frags`.
    """
    try:
        remaining = mol
        # Iterative form of the original tail recursion: each pass peels off
        # one piece and continues with what is left.
        while True:
            brics_bonds = list(BRICS.FindBRICSBonds(remaining))
            if not brics_bonds:
                frags.append(Chem.MolToSmiles(remaining))
                return frags
            # Each entry is ((atom_idx_1, atom_idx_2), labels); only the
            # lowest-indexed bond is cut on this pass.
            first_cut = min(
                remaining.GetBondBetweenAtoms(begin, end).GetIdx()
                for (begin, end), _labels in brics_bonds
            )
            broken = Chem.FragmentOnBonds(
                remaining, bondIndices=[first_cut], dummyLabels=[(0, 0)]
            )
            # The unpacking requires exactly two pieces; anything else raises
            # and is reported by the handler below.
            head, remaining = Chem.GetMolFrags(broken, asMols=True)
            frags.append(Chem.MolToSmiles(head))
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None
# --------------------- Remove the `*` dummy atoms ---------------------
def remove_dummy(smiles):
    """Replace every `*` in a fragment SMILES with `[H]` and round-trip it through RDKit.

    Args:
        smiles: Fragment SMILES string, possibly containing `*` characters.

    Returns:
        The SMILES produced by `Chem.MolToSmiles` after the substitution, or
        None if any step raises (the exception is printed).
    """
    try:
        # Plain text substitution: each `*` becomes an explicit hydrogen.
        stripped_smi = smiles.replace('*', '[H]')
        mol = Chem.MolFromSmiles(stripped_smi)
        # NOTE(review): presumably an unparsable string gives mol = None here,
        # in which case MolToSmiles raises and the handler returns None - confirm.
        return Chem.MolToSmiles(mol)
    except Exception as e:
        print(e)
        return None
1、单个smiles拆分为fragment
# Split a single SMILES string into fragments.
# NOTE(review): the variable is called `aspirin`, but this SMILES contains N
# and F atoms, so it cannot be aspirin (C9H8O4) - confirm the intended molecule.
aspirin = Chem.MolFromSmiles(
    'C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC(=O)C5=CC=CC=C54)F'
)
fragments = fragment_recursive(aspirin, [])
# Strip the `*` attachment points from every fragment before printing.
clean_fragments = list(map(remove_dummy, fragments))
print(clean_fragments)
2、csv文件的每一行smiles拆分为fragment + 删除重复的行 + 存入新文件中
# csv文件的每一行smiles拆分为fragment + 删除重复的行
import csv
import pandas as pd
# Read every row of the original CSV file.
# newline='' is what the csv module expects for files it parses itself.
with open('csv/double.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    input_data = list(reader)

# Fragment the SMILES in the first column of each row and keep one list of
# cleaned fragments per molecule.
output_data = []
for row in input_data:
    try:
        mol = Chem.MolFromSmiles(row[0])
        fragments = fragment_recursive(mol, [])
        # fragment_recursive returns None on failure, which makes this
        # comprehension raise and routes the row to the handler below.
        clean_fragments = [remove_dummy(smi) for smi in fragments]
        output_data.append(clean_fragments)
    except Exception:
        # Best effort: report the offending row and carry on with the rest.
        print("Current smiles process Error: ", row)

# Flatten the per-molecule lists, drop fragments that could not be cleaned
# (None) and duplicates, then write the result once. De-duplicating in memory
# avoids re-reading the file, which fails when no fragment was produced.
new_csv_name = 'output.smi'
all_fragments = []
for molecule_fragments in output_data:
    all_fragments.extend(molecule_fragments)
df = (
    pd.Series(all_fragments, dtype=object)
    .dropna()
    .drop_duplicates()
    .to_frame()
)
df.to_csv(new_csv_name, index=False, header=False)
二、 Recap切分
rdkit的Recap算法,Recap基于常见的反应,选择片段断键的位点,提供了化学合成意义上的可行性。
from rdkit.Chem import Recap
from rdkit.Chem import AllChem as Chem
# Build the Recap hierarchy for an example molecule.
m = Chem.MolFromSmiles('c1ccccc1OCCOC(=O)CC')
hierarch = Recap.RecapDecompose(m)

# Leaf nodes: hierarch.GetLeaves()
leaves = hierarch.GetLeaves()
print(leaves.keys())

# Descendant nodes: hierarch.GetAllChildren()
descendants = hierarch.GetAllChildren()
print(descendants.keys())

# Ancestor nodes, returned as a list: getUltimateParents()
ultimate_parents = hierarch.getUltimateParents()
print(ultimate_parents[0].smiles)