根据chembl_id 查询蛋白分类信息

最新推荐文章于 2023-08-20 20:26:46 发布

贺俊宏

最新推荐文章于 2023-08-20 20:26:46 发布

阅读量1.5k

点赞数 2

分类专栏：药用数据库的数据提取与使用

本文链接：https://blog.csdn.net/recher_he1107/article/details/108595282

版权

药用数据库的数据提取与使用专栏收录该内容

18 篇文章 59 订阅

订阅专栏

记录笨比上班生活又一天

没啥好说的直接上代码

以一个含有chembl target ID号的表格为起点,依次输入:输入文件地址、chembl_id列名,以及输出文件地址
chembl的api 分为多层: chembl_id >>>>> component_id >>>>> protein_id

# -*- coding: utf-8 -*-
"""
Created on Mon Sep 14 21:52:27 2020

@author: recherHE
@Email:recherHE.@163.COM
"""

import requests
import bs4
import json
import re
import pandas as pd
import multiprocessing as mp

Q_URL = "https://www.ebi.ac.uk/chembl/api/data/target/search?q={}"
id_URL ="https://www.ebi.ac.uk/chembl/api/data/target/{}"
COM_URL = "https://www.ebi.ac.uk/chembl/api/data/target_component/{}.json"
PRO_URL = "https://www.ebi.ac.uk/chembl/api/data/protein_class/{}.json"


def get_id(file_path, cb_column):
    """Load a CSV file and return the column holding the ChEMBL IDs.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        cb_column: Header name of the column to extract.

    Returns:
        The selected column as returned by indexing the loaded DataFrame
        with ``cb_column``.
    """
    table = pd.read_csv(file_path)
    return table[cb_column]

def get_expid(type):
    """Search ChEMBL targets for a term and return IDs from the first hit.

    The search URL (``Q_URL``) is fetched, the response body is parsed with
    BeautifulSoup, and the first ``target_chembl_id`` and ``component_id``
    tags found are extracted.

    Args:
        type: Search term inserted into ``Q_URL``. The name shadows the
            builtin ``type``; it is kept so existing keyword callers
            continue to work.

    Returns:
        A ``(chembl_id, component_id)`` tuple of strings. ``(None, None)``
        when the response has no ``target_chembl_id`` tag, and
        ``(chembl_id, None)`` when a target was found but no numeric
        ``component_id`` is present.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    # The 30 s value is a judgement call, not an API requirement.
    r = requests.get(Q_URL.format(type), timeout=30)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    chid_tag = soup.find("target_chembl_id")
    coid_tag = soup.find("component_id")
    if chid_tag is None:
        # The API returns an empty result for some search terms.
        print("None", type)
        return None, None
    print(chid_tag, coid_tag)
    chid = re.findall(r'<target_chembl_id>(.*)</target_chembl_id>', str(chid_tag))[0]
    # The original indexed [0] unconditionally, which raised IndexError when
    # the hit carried no component_id; report that case as None instead.
    co_matches = re.findall(r'<component_id>(\d+)</component_id>', str(coid_tag))
    coid = co_matches[0] if co_matches else None
    return chid, coid

def get_comid(cbid):
    """Return the first component_id listed for the ChEMBL target ``cbid``.

    The target record is fetched through the direct-lookup URL (``id_URL``)
    rather than the free-text search URL, so the component_id is read from
    the record of exactly the requested target instead of from whichever
    search hit happens to come first.

    Args:
        cbid: Target ChEMBL ID inserted into ``id_URL``.

    Returns:
        The digits of the first ``component_id`` tag in the response, as a
        string.

    Raises:
        IndexError: If the response contains no numeric ``component_id``
            tag (same exception type as before, now with a message).
    """
    # A timeout keeps one stalled request from blocking its worker forever.
    # The 30 s value is a judgement call, not an API requirement.
    r = requests.get(id_URL.format(cbid), timeout=30)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    coid = soup.find('component_id')
    print (coid)
    matches = re.findall(r'<component_id>(\d+)</component_id>', str(coid))
    if not matches:
        raise IndexError("no component_id found for target {!r}".format(cbid))
    return matches[0]

def get_com_data(coid):
    """Return the first protein_classification_id of a target component.

    Fetches the JSON document at ``COM_URL`` for ``coid`` and reads the
    ``protein_classification_id`` of the first entry in its
    ``protein_classifications`` list.

    Args:
        coid: Component ID inserted into ``COM_URL``.

    Returns:
        The ``protein_classification_id`` value of the first classification
        entry, as decoded from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response lacks the expected keys.
        IndexError: If the component has an empty ``protein_classifications``
            list (same exception type as before, now with a message).
    """
    # A timeout keeps one stalled request from blocking its worker forever.
    # The 30 s value is a judgement call, not an API requirement.
    r = requests.get(COM_URL.format(coid), timeout=30)
    # Fail on HTTP errors here instead of on a confusing JSON/Key error below.
    r.raise_for_status()
    text = json.loads(r.text)
    classifications = text["protein_classifications"]
    if not classifications:
        raise IndexError(
            "component {!r} has no protein classification".format(coid)
        )
    return classifications[0]["protein_classification_id"]

def get_pro_data(proid):
    """Return the protein target classification path for ``proid``.

    Fetches the JSON document at ``PRO_URL`` for ``proid`` and joins the
    non-empty values of its ``l1`` .. ``l8`` fields, in that order, with
    ``" > "``.

    Args:
        proid: Protein classification ID inserted into ``PRO_URL``.

    Returns:
        The joined classification string; empty if every level is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps one stalled request from blocking its worker forever.
    # The 30 s value is a judgement call, not an API requirement.
    r = requests.get(PRO_URL.format(proid), timeout=30)
    # Fail on HTTP errors here instead of on a confusing JSON error below.
    r.raise_for_status()
    text = json.loads(r.text)
    # A missing level key is treated like an empty level and skipped, rather
    # than aborting the whole lookup with KeyError.
    levels = (text.get("l{}".format(i)) for i in range(1, 9))
    return " > ".join(level for level in levels if level)

def main(cbid, out_path):
    """Resolve one ChEMBL target ID to its classification and append it.

    The lookup runs in three steps: target ID -> component ID
    (``get_comid``), component ID -> protein classification ID
    (``get_com_data``), classification ID -> classification string
    (``get_pro_data``). The result is printed and appended to ``out_path``
    as one tab-separated line: ``<cbid>\\t<classification>\\n``.

    Args:
        cbid: Target ChEMBL ID to resolve.
        out_path: File that receives the appended result line.
    """
    # Alternative starting point (not used here): begin from a search term
    # via get_expid(), which yields both a ChEMBL ID and a component ID.
    component_id = get_comid(cbid)
    classification_id = get_com_data(component_id)
    protype = get_pro_data(classification_id)
    print("sucessed:", cbid, protype)
    with open(out_path, 'ab') as sink:
        record = cbid + '\t' + protype + '\n'
        sink.write(record.encode())
    
if __name__ == "__main__":
    # Interactive entry point: ask for the input CSV, the name of the column
    # holding the ChEMBL IDs, and the output file, then resolve every ID in
    # a process pool.
    file_path = input("请输入文件地址:")
    cb_column = input("请输入CHEMBL_ID的列名:")
    out_path =  input("请输入输出文件地址:")
    cb_id_list = get_id(file_path,cb_column)
    pool = mp.Pool()
    for cbid in cb_id_list:
        # apply_async stores a worker's exception in its result object; since
        # the result objects are never inspected here, failures would vanish
        # silently. The error callback runs in this (parent) process and
        # reports which ID failed. ``cbid=cbid`` binds the current loop value.
        pool.apply_async(
            main,
            args=(cbid, out_path),
            error_callback=lambda exc, cbid=cbid: print("failed:", cbid, repr(exc)),
        )
    pool.close()
    pool.join()