批量获取多个PubChem实验(assay)中化合物的SMILES,并把SMILES写回到各自的实验文件中.

目标:已经下载了多个PubChem实验的csv文件,需要获取其中每个化合物(PUBCHEM_CID)对应的SMILES.

import pandas as pd
import os
import re

def Summary_cid(path, out_path, batch_size=500000):
    """Collect every PUBCHEM_CID under *path* and write them out in batches.

    Each file in *path* is read with ``pandas.read_csv``; rows whose
    ``PUBCHEM_CID`` is missing are dropped. The collected CIDs are then
    written to *out_path* as csv files named ``<start>-<start+batch_size>.csv``,
    each holding a single ``CID`` column with at most *batch_size* entries.

    Args:
        path: Directory containing the assay csv files.
        out_path: Existing directory that receives the batch csv files.
        batch_size: Maximum number of CIDs per output file (default 500000).
    """
    CIDS = []
    # Sorted so the batch contents do not depend on directory listing order.
    for file in sorted(os.listdir(path)):
        df = pd.read_csv(os.path.join(path, file))
        df = df.dropna(axis=0, subset=["PUBCHEM_CID"])
        CIDS.extend(df['PUBCHEM_CID'])
    for i in range(0, len(CIDS), batch_size):
        # Slice exactly batch_size items so consecutive batches never share
        # a CID (the previous slice end was one element too far).
        batch = pd.DataFrame({'CID': CIDS[i:i + batch_size]})
        batch.to_csv(os.path.join(out_path, '{}-{}.csv'.format(i, i + batch_size)))


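# Manual step: use the CID csv files produced above to download the matching
# SMILES text files from the PubChem website (one "CID<tab or comma>SMILES"
# record per line); the functions below consume those text files.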

def get_relationship_dic(path):
    """Build a CID -> SMILES dictionary from the text files in *path*.

    Every file in the directory is read line by line. A line is split on
    tabs or commas; the first field is taken as the CID and the second as
    the SMILES string, both stripped of surrounding whitespace. Lines with
    fewer than two fields (e.g. blank lines) are skipped.

    Args:
        path: Directory containing the downloaded CID/SMILES text files.

    Returns:
        dict mapping CID strings to SMILES strings. When a CID appears more
        than once, the last occurrence read wins.
    """
    dic = {}
    for file in os.listdir(path):
        # Read-only access is enough; the files are never modified here.
        with open(os.path.join(path, file), 'r') as f:
            for line in f:
                fields = line.replace('\t', ',').split(',')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to map.
                    continue
                dic[fields[0].strip()] = fields[1].strip()
    return dic

def append_smile_column(path, cid_to_smiles=None):
    """Add a ``SMILES`` column to every csv file in *path*, in place.

    Each file is read, rows without a ``PUBCHEM_CID`` are dropped, the CID
    is converted to an integer string and looked up in the CID -> SMILES
    mapping, and the file is overwritten with the extra ``SMILES`` column.

    Args:
        path: Directory containing the assay csv files to update.
        cid_to_smiles: Mapping from CID strings to SMILES strings. When
            omitted, the module-level ``dic`` is used, which keeps the
            original one-argument call working.

    Raises:
        KeyError: If a CID in a file is absent from the mapping.
    """
    if cid_to_smiles is None:
        # Fall back to the global mapping built by the script entry point.
        cid_to_smiles = dic
    for file in os.listdir(path):
        print('{} started'.format(file))
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path).dropna(axis=0, subset=['PUBCHEM_CID'])
        # CIDs are read as floats when the column contained missing values,
        # so normalise to the integer-string form used as dictionary keys.
        df['SMILES'] = df['PUBCHEM_CID'].map(lambda x: cid_to_smiles[str(int(x))])
        df.to_csv(file_path, index=False)
        print('{} finished'.format(file))

if __name__ == '__main__':
    # The three directories are supplied by the user at run time; the
    # original placeholders were not valid Python.
    ori_path = input('Directory with the PubChem assay csv files: ').strip()
    out_path = input('Directory for the batched CID csv files: ').strip()
    txt_path = input('Directory with the downloaded SMILES text files: ').strip()
    # Step 1: gather all CIDs and write them out in batches.
    Summary_cid(ori_path, out_path)
    # Step 2: the SMILES text files in txt_path must already have been
    # downloaded from PubChem for the CIDs; build the CID -> SMILES mapping.
    # `dic` stays a module-level name because append_smile_column reads it.
    dic = get_relationship_dic(txt_path)
    # Step 3: write the SMILES column back into each assay file.
    append_smile_column(ori_path)
  • 3
    点赞
  • 21
    收藏
    觉得还不错? 一键收藏
  • 7
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值