根据chembl_id 查询蛋白分类信息

最新推荐文章于 2023-08-20 20:26:46 发布

贺俊宏

最新推荐文章于 2023-08-20 20:26:46 发布

阅读量1.5k

点赞数 2

分类专栏：药用数据库的数据提取与使用

本文链接：https://blog.csdn.net/recher_He1107/article/details/108595282

版权

药用数据库的数据提取与使用专栏收录该内容

18 篇文章 59 订阅

订阅专栏

记录笨比上班生活又一天

没啥好说的直接上代码

以一个含有 ChEMBL target ID 号的表格为起点，依次输入：输入文件地址、chembl_id 列名以及输出文件地址。
ChEMBL 的 API 分为多层，查询顺序为：chembl_id >>>>> component_id >>>>> protein_id

# -*- coding: utf-8 -*-
"""
Created on Mon Sep 14 21:52:27 2020

@author: recherHE
@Email:recherHE.@163.COM
"""

import requests
import bs4
import json
import re
import pandas as pd
import multiprocessing as mp

Q_URL = "https://www.ebi.ac.uk/chembl/api/data/target/search?q={}"
id_URL ="https://www.ebi.ac.uk/chembl/api/data/target/{}"
COM_URL = "https://www.ebi.ac.uk/chembl/api/data/target_component/{}.json"
PRO_URL = "https://www.ebi.ac.uk/chembl/api/data/protein_class/{}.json"


def get_id(file_path,cb_column):
    """Load a CSV table and return the column holding the ChEMBL IDs.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        cb_column: Name of the column that contains the ChEMBL IDs.

    Returns:
        The selected column as a pandas Series.

    Raises:
        KeyError: If ``cb_column`` is not a column of the table.
    """
    return pd.read_csv(file_path)[cb_column]

def get_expid(type, timeout=30):
    """Search ChEMBL targets and return the IDs of the first hit.

    The search term is substituted into ``Q_URL``; the first
    ``target_chembl_id`` and ``component_id`` elements found in the response
    are extracted.

    Args:
        type: Search term for the target search endpoint. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(target_chembl_id, component_id)`` tuple of strings, or
        ``(None, None)`` when the response does not contain both values.
    """
    url = Q_URL.format(type)
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    chid = soup.find("target_chembl_id")
    coid = soup.find("component_id")
    if chid is None:
        # For some search terms the API returns an empty result.
        print("None", type)
        return None, None
    print(chid, coid)
    ch_match = re.search(r'<target_chembl_id>(.*)</target_chembl_id>', str(chid))
    co_match = re.search(r'<component_id>(\d+)</component_id>', str(coid))
    if ch_match is None or co_match is None:
        # A target without a usable component used to crash with a bare
        # IndexError; report it the same way as an empty search result.
        print("None", type)
        return None, None
    return ch_match.group(1), co_match.group(1)

def get_comid(cbid, timeout=30):
    """Look up the component_id associated with a ChEMBL target ID.

    The ID is sent as the query of the target search endpoint (``Q_URL``) and
    the first ``component_id`` element of the response is used.

    Args:
        cbid: ChEMBL target ID substituted into ``Q_URL``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The component ID as a string of digits.

    Raises:
        IndexError: If the response contains no numeric ``component_id``.
    """
    url = Q_URL.format(cbid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    coid = soup.find('component_id')
    print (coid)
    match = re.search(r'<component_id>(\d+)</component_id>', str(coid))
    if match is None:
        # Same exception type as the original list-index failure, but with a
        # message that says which ID could not be resolved.
        raise IndexError("no component_id found for {!r}".format(cbid))
    return match.group(1)

def get_com_data(coid, timeout=30):
    """Fetch the protein_classification_id of a target component.

    Args:
        coid: Component ID substituted into ``COM_URL``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``protein_classification_id`` of the first entry in the
        component's ``protein_classifications`` list.

    Raises:
        IndexError: If the component has no protein classification entries.
        KeyError: If the response lacks the expected keys.
    """
    url = COM_URL.format(coid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    record = json.loads(r.text)
    classifications = record["protein_classifications"]
    if not classifications:
        # Keep the original exception type but say which component is
        # unclassified instead of "list index out of range".
        raise IndexError(
            "component {!r} has no protein classification".format(coid)
        )
    return classifications[0]["protein_classification_id"]

def get_pro_data(proid, timeout=30):
    """Fetch a protein classification and render it as a readable path.

    Reads the keys ``l1`` through ``l8`` from the JSON returned for ``proid``
    and joins the non-empty ones with ``" > "``.

    Args:
        proid: Protein classification ID substituted into ``PRO_URL``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The non-empty classification levels joined by ``" > "``; an empty
        string when every level is empty.

    Raises:
        KeyError: If the response lacks one of the ``l1``..``l8`` keys.
    """
    url = PRO_URL.format(proid)
    # Without a timeout a stalled connection would block the worker forever.
    r = requests.get(url, timeout=timeout)
    record = json.loads(r.text)
    levels = (record["l{}".format(depth)] for depth in range(1, 9))
    # Empty levels are skipped so the path has no dangling separators.
    return " > ".join(level for level in levels if level)

def main(cbid,out_path):
    """Resolve one ChEMBL target ID to its classification and record it.

    Chains ``get_comid`` -> ``get_com_data`` -> ``get_pro_data``, prints a
    status line, and appends one tab-separated ``cbid`` / classification
    line to ``out_path``.

    Args:
        cbid: ChEMBL target ID to resolve.
        out_path: File that receives the result line; opened in binary
            append mode.
    """
    # An alternative starting point is a free-text search term resolved with
    # get_expid(); this function starts from a concrete target ID instead.
    component_id = get_comid(cbid)
    classification_id = get_com_data(component_id)
    protype = get_pro_data(classification_id)
    print("sucessed:", cbid, protype)
    with open(out_path, 'ab') as out_file:
        out_file.write('\t'.join((cbid, protype)).encode() + b'\n')
    
def _make_failure_reporter(cbid):
    """Return a callback that prints the exception raised while handling *cbid*."""
    def _report(exc):
        print("failed:", cbid, repr(exc))
    return _report


if __name__ == "__main__":
    # Prompt for the input CSV, the ID column name and the output file.
    file_path = input("请输入文件地址:")
    cb_column = input("请输入CHEMBL_ID的列名:")
    out_path =  input("请输入输出文件地址:")
    cb_id_list = get_id(file_path,cb_column)
    pool = mp.Pool()
    for cbid in cb_id_list:
        # The async results are never collected, so without an error callback
        # every exception raised in a worker would be discarded silently.
        pool.apply_async(
            main,
            args=(cbid, out_path),
            error_callback=_make_failure_reporter(cbid),
        )
    pool.close()
    pool.join()

运行效果

在这里插入图片描述

运行结果

在这里插入图片描述

贺俊宏

关注

2
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
根据chembl_id 查询蛋白分类信息

记录笨比上班生活又一天没啥好说的直接上代码运行效果运行结果没啥好说的直接上代码以一个含有chembl target ID号的表格为起点,输入输入文件地址,chembl_id列名,以及输出文件地址chembl的api 分为多层 chembl_id>>>>>componen_id>>>>>protein_id# -*- coding: utf-8 -*-"""Created on Mon Sep 14 21:52:27 2020@au
复制链接

扫一扫

专栏目录