# Goal: given the CSV exports of several PubChem assays, obtain the SMILES string of every compound they contain.
import pandas as pd
import os
import re
def Summary_cid(path, out_path, batch_size=500000):
    """Collect every PUBCHEM_CID from the CSV files in *path* and write them out in batches.

    Each batch is written to ``out_path`` as ``<start>-<start + batch_size>.csv``
    with a single ``CID`` column (plus the default pandas index column).

    Args:
        path: Directory whose entries are all CSV files with a ``PUBCHEM_CID`` column.
        out_path: Existing directory that receives the batch files.
        batch_size: Maximum number of CIDs per batch file.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    CIDS = []
    for file in os.listdir(path):
        df = pd.read_csv(os.path.join(path, file))
        df = df.dropna(axis=0, subset=["PUBCHEM_CID"])
        # Missing values make pandas load the column as float; cast back so the
        # batch files contain "123" instead of "123.0".
        CIDS.extend(df['PUBCHEM_CID'].astype('int64'))

    for i in range(0, len(CIDS), batch_size):
        # Slice exactly batch_size items so consecutive batches never overlap.
        batch = pd.DataFrame({'CID': CIDS[i:i + batch_size]})
        batch.to_csv(os.path.join(out_path, '{}-{}.csv'.format(i, i + batch_size)))
# Manual step: use the batch CSV files to download the matching SMILES text files from the PubChem website.
def get_relationship_dic(path):
    """Build a CID -> SMILES dictionary from every text file in *path*.

    Each line is expected to hold the CID followed by the SMILES, separated by
    a tab or a comma. Lines with fewer than two fields (e.g. blank lines) are
    skipped. If a CID occurs more than once, the last occurrence wins.

    Args:
        path: Directory whose entries are all CID/SMILES text files.

    Returns:
        dict mapping CID strings to SMILES strings (both stripped of whitespace).
    """
    dic = {}
    for file in os.listdir(path):
        # Read-only mode is enough; nothing is written back to these files.
        with open(os.path.join(path, file), 'r') as f:
            for line in f:
                fields = line.replace('\t', ',').split(',')
                if len(fields) < 2:
                    continue
                dic[fields[0].strip()] = fields[1].strip()
    return dic
def append_smile_column(path, mapping=None):
    """Add a ``SMILES`` column to every CSV file in *path*, rewriting each file in place.

    Rows without a ``PUBCHEM_CID`` are dropped. The SMILES is looked up by the
    CID converted to an integer string. CIDs absent from the mapping get an
    empty SMILES value and are reported, instead of aborting with ``KeyError``
    after some files have already been overwritten.

    Args:
        path: Directory whose entries are all CSV files with a ``PUBCHEM_CID`` column.
        mapping: dict of CID string -> SMILES. When omitted, the module-level
            ``dic`` is used.
    """
    if mapping is None:
        mapping = dic  # module-level dictionary created by the script entry point

    for file in os.listdir(path):
        print('{} started'.format(file))
        full_path = os.path.join(path, file)
        df = pd.read_csv(full_path).dropna(axis=0, subset=['PUBCHEM_CID'])
        # CIDs are usually loaded as floats (1234.0); normalise to "1234" to match the keys.
        keys = df['PUBCHEM_CID'].astype('int64').astype(str)
        df['SMILES'] = keys.map(mapping)  # dict lookup; unknown keys become NaN
        missing = int(df['SMILES'].isna().sum())
        if missing:
            print('{}: {} CID(s) without SMILES'.format(file, missing))
        df.to_csv(full_path, index=False)
        print('{} finished'.format(file))
if __name__ == '__main__':
    # The three directories are supplied by the user at run time.
    ori_path = input('Directory containing the PubChem assay CSV files: ').strip()
    out_path = input('Directory to write the CID batch files to: ').strip()
    # txt_path must already hold the SMILES text files downloaded from PubChem.
    txt_path = input('Directory containing the downloaded SMILES text files: ').strip()

    Summary_cid(ori_path, out_path)
    # Kept as a module-level name: append_smile_column reads `dic` from global scope.
    dic = get_relationship_dic(txt_path)
    append_smile_column(ori_path)