python_getUniprot: fetch a specified field from the UniProt website

  1. Fetch the specified field from the website
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool


class Downloader:
    def __init__(self):
        self.server = 'https://www.uniprot.org/uniprot/'
        self.file = r"uniprots_input.csv"
        # Entry page URL template; 'XXX' is replaced with each UniProt accession.
        # self.url = 'https://www.uniprot.org/uniprot/XXX#family_and_domains'
        self.url = 'https://www.uniprot.org/uniprot/XXX'
        self.uniprots = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                        'Upgrade-Insecure-Requests': '1'}
        self.outpath_txt = r"uniprots_output.csv"

    def get_uni(self):
        """Read the accession list from the 'Uniports' column of the input CSV."""
        df = pd.read_csv(self.file)
        self.uniprots.extend(df['Uniports'])

    def get_contents(self, uniprot):
        """Fetch the entry page for one accession and extract the gene-name field."""
        target = self.url.replace('XXX', uniprot)
        s = requests.Session()
        rep = s.get(url=target, verify=False, headers=self.headers, timeout=30)
        rep.raise_for_status()
        soup = BeautifulSoup(rep.text, 'html.parser')
        # To scrape protein families instead of the gene name, switch the selector to:
        # families = soup.select('#family_and_domains > div.annotation > a')
        families = soup.select('#content-gene')
        fam = [i.get_text() for i in families]
        fam.append(uniprot)  # keep the accession so the result can be matched back later
        print(fam)
        return fam

    # Append one result line (comma-separated, accession last) to the output file.
    def write_txt(self, family):
        with open(self.outpath_txt, 'a') as f:
            f.write(','.join(str(x) for x in family) + '\n')

    def trans_csv(self):
        """Merge the downloaded results back into the input table as a 'Family' column."""
        df = pd.read_csv(self.file)
        families = {}
        with open(self.outpath_txt, 'r') as f:
            for line in f:
                fields = line.rstrip('\n').split(',')
                # The last field is the accession; the remaining fields are the scraped values.
                families[fields[-1]] = '|'.join(fields[:-1])
        # Accessions that never produced a result get an empty entry.
        for uni in df['Uniports']:
            if uni not in families:
                families[uni] = None
        df['Family'] = df['Uniports'].map(lambda x: families[x])
        df.to_csv(self.file, index=False)


if __name__ == "__main__":
    dl = Downloader()
    dl.get_uni()
    pool = Pool(processes=8)
    for uniprot in dl.uniprots:
        try:
            # The page itself is fetched in the main process; only the file write runs in the pool.
            pool.apply_async(func=dl.write_txt, args=(dl.get_contents(uniprot),))
            print("uniprot: " + uniprot + " info download finished")
            time.sleep(0.5)
        except Exception:
            print(uniprot + " download failed")
    pool.close()
    pool.join()
    dl.trans_csv()
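
The script expects uniprots_input.csv to carry one accession per row in a column literally named Uniports (the spelling used throughout the code). A minimal input file, using the A1Z199 accession shown below, would look like:

Uniports
A1Z199

Each successfully downloaded accession then appends one comma-separated line to uniprots_output.csv, ending with the accession itself so that trans_csv can map the result back onto the input table.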

(Screenshot: UniProt entry page for accession A1Z199)

(Screenshot: the corresponding gene name obtained for A1Z199)
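
For a quick check of a single accession such as A1Z199, the class can also be used without the pool. This is a minimal sketch, assuming the Downloader class above has been defined in the same file or session and that the entry page still exposes a #content-gene element:

# Single-accession check (Downloader class from the script above must be in scope)
dl = Downloader()
fam = dl.get_contents('A1Z199')   # e.g. ['<gene name>', 'A1Z199']
dl.write_txt(fam)                 # appends one comma-separated line to uniprots_output.csv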

Note: Python 3.7
Reference: https://blog.csdn.net/recher_He1107/article/details/103888869
