Scraping protein family information from the UniProt website

Raw data:
(screenshot of the input csv omitted)
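The script below only relies on the input csv having a 'Uniprots' column holding the accessions (the column name is taken from the code). A made-up example of that layout, since the screenshot only shows the shape of the file:

Uniprots
P00533
P04637
O43451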
Enough talk, on to the code:

import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool

"""
类说明:从uniprot网站下获取蛋白的家族信息
Parameters:
    无
Returns:
    无
Modify:
    2020-01-09
"""

class Downloader():
    
    def __init__(self):
        self.server = 'https://www.uniprot.org/uniprot/'
        self.file = r"D:\SMALL_MOLECULAR_AIDS\chembl\outcome\10uM(less)\uniprots(2806).csv"
        self.url = 'https://www.uniprot.org/uniprot/XXX#family_and_domains'
        self.uniprots = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0','Upgrade-Insecure-Requests': '1'}
        # path of the txt file that collects the scraped family info
        self.outpath_txt = r"C:\Users\86177\Desktop\families.txt"
    
    """
    函数说明:获取uniprot
    Parameters:
        无
    Returns:
        无
    Modify:
        2020-01-07
    """    
    def get_uni(self):
        # read the accession list from the 'Uniprots' column of the input csv
        df = pd.read_csv(self.file)
        self.uniprots.extend(df['Uniprots'])
    
    """
    函数说明:获取uniprot号的蛋白质家族信息
    Parameters:
        uniprot
    Returns:
        fam (protein's(uniprot's) family info)
    Modify:
        2020-01-09
    """  
    def get_contents(self, uniprot):
        target = self.url.replace('XXX', uniprot)
        s = requests.session()
        rep = s.get(url=target, verify=False, headers=self.headers, timeout=30)
        rep.raise_for_status()
        soup = BeautifulSoup(rep.text, 'html.parser')
        # family annotations are usually direct links under the annotation div
        families = soup.select('#family_and_domains > div.annotation > a')
        fam = [i.get_text() for i in families]
        if fam:
            fam.append(uniprot)
        else:
            # fall back to an alternative page layout where the links sit in a span
            families = soup.select('#family_and_domains > span > a')
            fam = [i.get_text() for i in families]
            fam.append(uniprot)
        print(fam)
        return fam
        
    
    """
    函数说明:输出数据
    Parameters:
        family(protein's(uniprot's) family info)
    Returns:
        无
    Modify:
        2020-01-07
    """     
    def write_txt(self, family):
        # join with ',' so trans_csv can split each line back into fields
        # (this assumes family names themselves contain no commas)
        with open(self.outpath_txt, 'a') as f:
            f.write(','.join(family) + '\n')
    
    """
	函数说明:将爬虫结果映射进原来的csv表格中
	Parameters:
    无
	Returns:
    无
	Modify:
    2020-01-09
	"""
    def trans_csv(self):
        df = pd.read_csv(self.file)
        families = {}
        with open(self.outpath_txt, 'r') as f:
            for f_u in f.readlines():
                f_u = f_u.replace('\n', '').split(',')
                # the accession is the last field; everything before it is family info
                families[f_u[-1]] = '|'.join(f_u[:-1])
        # accessions that never made it into the txt file get no family
        uni_left = [uni for uni in list(df['Uniprots']) if uni not in families.keys()]
        for u in uni_left:
            families[u] = None
        df['Family'] = df['Uniprots'].map(lambda x: families[x])
        df.to_csv(self.file, index=False)
            
if __name__ == "__main__":
    dl = Downloader()
    dl.get_uni()
    pool = Pool(processes=8)
    for uniprot in dl.uniprots:
        # run the download itself in a worker process; write_txt runs as the
        # callback in the main process, so appends to the txt file never interleave
        pool.apply_async(func=dl.get_contents, args=(uniprot,),
                         callback=dl.write_txt,
                         error_callback=lambda e, u=uniprot: print(u + " download fail"))
        print("uniprot:" + uniprot + " submitted")
        time.sleep(0.5)
    pool.close()
    pool.join()

    dl.trans_csv()
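Side note: because write_txt opens the output in append mode, families.txt accumulates across runs, so a second run can skip accessions that are already done. A minimal sketch of that filtering step (my own addition, reusing the Downloader above; not part of the original script):

dl = Downloader()
dl.get_uni()
# collect the accessions that already have a line in families.txt
with open(dl.outpath_txt, 'r') as f:
    done = {line.rstrip('\n').split(',')[-1] for line in f}
# keep only the accessions that still need fetching
dl.uniprots = [u for u in dl.uniprots if u not in done]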

Output:
(screenshot of the resulting family mapping omitted)

You can see that some uniprots still haven't been mapped to a family; most likely the page structure is different for those entries, so the selectors didn't match. To be continued!!
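One option for the stragglers is to skip the HTML entirely: UniProt also serves each entry as a flat text file at https://www.uniprot.org/uniprot/<accession>.txt, where family membership appears in "CC   -!- SIMILARITY: Belongs to ..." comment lines. A rough sketch of that fallback (assuming the flat-file endpoint is still served; note that SIMILARITY comments can wrap across several CC lines, so this only grabs the first line of each):

import requests

def family_from_flatfile(uniprot):
    # hypothetical fallback: parse the flat file instead of the HTML page
    rep = requests.get('https://www.uniprot.org/uniprot/' + uniprot + '.txt', timeout=30)
    rep.raise_for_status()
    fams = []
    for line in rep.text.splitlines():
        # family info lives in comment lines such as:
        # CC   -!- SIMILARITY: Belongs to the protein kinase superfamily. ...
        if line.startswith('CC') and 'SIMILARITY:' in line:
            fams.append(line.split('SIMILARITY:', 1)[1].strip())
    return fams

print(family_from_flatfile('P00533'))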
