记录笨比上班生活又一天
没啥好说的直接上代码
以一个含有 ChEMBL target ID 的表格(CSV 文件)为起点,运行后依次输入:输入文件地址、chembl_id 所在列的列名,以及输出文件地址
ChEMBL 的 API 分为多层:chembl_id >>>>> component_id >>>>> protein_classification_id
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 14 21:52:27 2020
@author: recherHE
@Email:recherHE.@163.COM
"""
import requests
import bs4
import json
import re
import pandas as pd
import multiprocessing as mp
# ChEMBL REST API URL templates; each "{}" is filled in with str.format().
# Target search endpoint. No ".json" suffix is requested; get_expid and
# get_comid parse the reply as markup with BeautifulSoup.
Q_URL = "https://www.ebi.ac.uk/chembl/api/data/target/search?q={}"
# Presumably a direct single-target lookup by ChEMBL ID; it is not referenced
# by any function visible in this file.
id_URL ="https://www.ebi.ac.uk/chembl/api/data/target/{}"
# Target component record, requested as JSON (used by get_com_data).
COM_URL = "https://www.ebi.ac.uk/chembl/api/data/target_component/{}.json"
# Protein classification record, requested as JSON (used by get_pro_data).
PRO_URL = "https://www.ebi.ac.uk/chembl/api/data/protein_class/{}.json"
def get_id(file_path, cb_column):
    """Read the ChEMBL target IDs from one column of a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        cb_column: Name of the column that holds the ChEMBL IDs.

    Returns:
        A pandas Series with the values of ``cb_column`` in file order.
        Rows whose cell is empty are dropped; the remaining rows keep
        their original index labels.

    Raises:
        KeyError: If ``cb_column`` is not a column of the file.
    """
    df = pd.read_csv(file_path)
    # An empty cell is read as NaN (a float). Passing it on would query the
    # API for "nan" and then fail when the ID is concatenated into the
    # output line, so such rows are skipped here.
    return df[cb_column].dropna()
def get_expid(type, timeout=30):
    """Search ChEMBL targets by text and return the IDs of the first match.

    Args:
        type: Search text substituted into the target search URL. The name
            shadows the builtin but is kept so keyword callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(target_chembl_id, component_id)`` tuple of strings, taken from
        the first ``target_chembl_id`` and the first ``component_id``
        elements in the response. ``(None, None)`` when the response has no
        ``target_chembl_id``; only the second item is ``None`` when a target
        is present but the response carries no ``component_id``.

    Raises:
        requests.RequestException: On connection errors or a timeout.
    """
    url = Q_URL.format(type)
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    chid = soup.find("target_chembl_id")
    coid = soup.find("component_id")
    if chid is None:
        # The API returns an empty result for some search terms.
        print("None", type)
        return None, None
    print(chid, coid)
    # Read the element text directly instead of re-parsing str(tag) with a
    # regex, and tolerate a target that has no component instead of failing
    # with a bare IndexError.
    component = coid.get_text(strip=True) if coid is not None else None
    return chid.get_text(strip=True), component
def get_comid(cbid, timeout=30):
    """Find the component_id associated with a ChEMBL target ID.

    Args:
        cbid: ChEMBL target ID, substituted into the target search URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``component_id`` element in the response,
        as a string.

    Raises:
        requests.RequestException: On connection errors, a timeout, or an
            HTTP error status.
        IndexError: If the response contains no ``component_id`` element.
            This is the exception type the unguarded lookup used to raise;
            it now carries a message naming the target.
    """
    url = Q_URL.format(cbid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    # NOTE(review): this uses the search endpoint and takes the first
    # component_id anywhere in the reply; confirm the first hit is always
    # the requested target.
    coid = soup.find('component_id')
    print(coid)
    if coid is None:
        raise IndexError("no component_id found for target {}".format(cbid))
    return coid.get_text(strip=True)
def get_com_data(coid, timeout=30):
    """Return the protein_classification_id of a target component.

    Args:
        coid: Component ID, substituted into the target_component URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``protein_classification_id`` of the first entry in the
        component's ``protein_classifications`` list.

    Raises:
        requests.RequestException: On connection errors, a timeout, or an
            HTTP error status.
        KeyError: If the JSON reply lacks an expected key.
        IndexError: If the component has no protein classification. Same
            exception type as before, now with a message naming the
            component.
    """
    url = COM_URL.format(coid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    text = json.loads(r.text)
    classifications = text["protein_classifications"]
    if not classifications:
        raise IndexError(
            "component {} has no protein classification".format(coid)
        )
    return classifications[0]["protein_classification_id"]
def get_pro_data(proid, timeout=30):
    """Build the protein target classification path for a classification ID.

    Args:
        proid: Protein classification ID, substituted into the protein_class
            URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The non-empty values of the ``l1`` .. ``l8`` fields of the JSON
        reply, in that order, joined with ``" > "``. An empty string when
        all eight fields are empty.

    Raises:
        requests.RequestException: On connection errors, a timeout, or an
            HTTP error status.
        KeyError: If the JSON reply lacks one of the ``l1`` .. ``l8`` keys.
    """
    url = PRO_URL.format(proid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    text = json.loads(r.text)
    # Levels that are unset come back falsy and are left out of the path.
    levels = (text["l{}".format(i)] for i in range(1, 9))
    return " > ".join(level for level in levels if level)
def main(cbid, out_path):
    """Resolve one ChEMBL target ID to its classification and record it.

    The lookup chains three calls: target ID -> component_id ->
    protein_classification_id -> classification path. The result is printed
    and appended to ``out_path`` as one ``cbid<TAB>path`` line.

    Args:
        cbid: ChEMBL target ID to classify.
        out_path: File that receives the appended result line.
    """
    # Alternative starting point: begin from a search term rather than a
    # concrete target ID, e.g. ``chid, coid = get_expid("ALB")``.
    component_id = get_comid(cbid)
    classification_id = get_com_data(component_id)
    classification_path = get_pro_data(classification_id)
    print("sucessed:", cbid, classification_path)
    # Binary append mode: each call adds exactly one encoded line.
    with open(out_path, "ab") as out_file:
        record = "\t".join((cbid, classification_path)) + "\n"
        out_file.write(record.encode())
if __name__ == "__main__":
    # Prompts, in order: input CSV path, name of the column holding the
    # ChEMBL IDs, output file path.
    file_path = input("请输入文件地址:")
    cb_column = input("请输入CHEMBL_ID的列名:")
    out_path = input("请输入输出文件地址:")
    cb_id_list = get_id(file_path, cb_column)
    with mp.Pool() as pool:
        for cbid in cb_id_list:
            pool.apply_async(
                main,
                args=(cbid, out_path),
                # The AsyncResult is never read, so without this callback an
                # exception raised by main() in a worker would be discarded
                # silently. cbid is bound as a default argument so that each
                # callback reports its own ID.
                error_callback=lambda exc, cbid=cbid: print(
                    "failed:", cbid, repr(exc)
                ),
            )
        # close() + join() must run inside the with-block: leaving the block
        # terminates the pool, which would kill tasks still in flight.
        pool.close()
        pool.join()