简单爬虫----NCBI上获取品种信息

简单爬虫实践

手里有了RUNID,需要收集这些测序生物样本的品种等相关信息。

记录于此,有待改善。

'''
@author:yanglv
2020-11-7 20:18:49
简单爬虫---根据表格中的runid,在NCBI爬取Biosample信息,并写入表格。
'''

import requests
import re
from bs4 import BeautifulSoup
import xlrd
from xlutils.copy import copy
import time
from retrying import retry


def open_excel(excel_name):
    """Read the SRR run IDs (column 1 of Sheet1) still left to process.

    Rows whose breed column is already filled are skipped via jump(),
    so an interrupted run can be resumed.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    run_ids = sheet.col_values(0)  # column 1: the SRR accession numbers
    # Skip the header row plus every row already written (see jump()).
    start = jump(excel_name) + 1
    return run_ids[start:]


def qukongge(mylist=None):
    """Drop single-space placeholder entries from a findall result.

    The regex scans in get_beefinformation() use alternatives like
    ``...|\\s``, so their findall output is littered with lone space
    characters; this filters them out.

    Parameters
    ----------
    mylist : list or None
        Raw ``re.findall`` result.  ``None`` is treated as an empty
        list (the original raised TypeError when called with its own
        default argument).

    Returns
    -------
    list
        The non-space items, or ``[' ']`` when nothing remains, so
        callers can always index element 0 safely.
    """
    items = mylist if mylist is not None else []
    cleaned = [item for item in items if item != ' ']
    # Guarantee at least one element so callers can use cleaned[0].
    return cleaned if cleaned else [' ']


def jump(excel_name):
    """Count how many breed cells (column 2 of Sheet1) are already filled.

    Used as a resume point: rows up to this count were written by a
    previous run and can be skipped.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    # Drop the header cell, then count the non-empty breed entries.
    breed_cells = sheet.col_values(1)[1:]
    done = sum(1 for cell in breed_cells if cell != '')
    print("我已经有了的数据数为: " + str(done))
    return done

#: Desktop-browser User-Agent so NCBI serves the normal HTML pages.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}


def _extract_field(page_html, find_pattern, strip_pattern):
    # Extract one BioSample attribute: locate the
    # "<th>label</th><td>value</td>" fragment, drop whitespace-only
    # matches (see qukongge), then strip the surrounding markup.
    matches = qukongge(re.findall(find_pattern, page_html))
    return re.sub(strip_pattern, '', matches[0])


# Bounded retries: the original bare @retry() retried forever on any
# exception, which can hang the script on a persistent failure.
@retry(stop_max_attempt_number=5, wait_fixed=2000)
def get_beefinformation(excel_name):
    """Scrape BioSample metadata from NCBI for each SRR run ID.

    For every run ID in column 1 of Sheet1 (not yet processed), fetch the
    SRA search page, follow the BioSample link, extract breed / age / sex /
    tissue / biomaterial provider / sample type / treatment, and write them
    into columns 2-9 of the same row.  The workbook is saved after every
    row so progress survives a crash or retry.

    Parameters
    ----------
    excel_name : str
        Path to the .xls workbook being read and updated in place.
    """
    srr_list = open_excel(excel_name)
    line = jump(excel_name)
    for srr in srr_list:
        print(srr)
        url = 'https://www.ncbi.nlm.nih.gov/sra/?term=' + srr
        print("现在进行的网页为" + url)
        # BUG FIX: headers must be passed by keyword.  The original
        # requests.get(url, headers) sent the dict as query *params*,
        # so the User-Agent header was never actually set.
        response = requests.get(url, headers=_HEADERS)
        htmlfile = BeautifulSoup(response.text, 'lxml')
        # The BioSample link's position varies between result layouts.
        sample = htmlfile.select('#ResultView > div:nth-child(4) > span > div > a:nth-child(1)')
        if not sample:
            sample = htmlfile.select('#ResultView > div:nth-child(3) > span > div > a:nth-child(1)')
        print(str(sample))
        result1 = str(sample[0])
        link1 = re.findall(r'/\w+/\w+', result1)[0]
        print(link1)
        sampleID = re.findall(r'SAMN\d+', result1)[0]
        print(sampleID)
        time.sleep(1)  # be polite to NCBI between the two requests
        url = 'https://www.ncbi.nlm.nih.gov' + link1
        print("生物样本网页" + url)
        response1 = requests.get(url, headers=_HEADERS)
        htmlfile = BeautifulSoup(response1.text, 'lxml')
        # The attribute table's position also varies between layouts.
        mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(4) > dd')
        if not mydata:
            mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(3) > dd')
        print("-----生物样本数据------")
        print(mydata)
        mydata = str(mydata[0])
        breed = _extract_field(mydata, r'breed</th><td>.{0,15}\s</td>', r'breed</th><td>|</td>')
        print("breed值" + breed)
        age = _extract_field(mydata, r'age</th><td>.{0,15}\s</td>|\s', r'age</th><td>|\s</td>|\s')
        print("age值" + age)
        sex = _extract_field(mydata, r'sex</th><td>.{0,15}\s</td>|\s', r'sex</th><td>|\s</td>|\s')
        print("sex值" + sex)
        tissue = _extract_field(mydata, r'tissue</th><td>.{0,15}</td>|\s', r'tissue</th><td>|\s</td>|\s')
        print("tissue值" + tissue)
        biomaterrial = _extract_field(mydata, r'biomaterial\sprovider</th><td>.{0,15}\s</td>|\s',
                                      r'biomaterial\sprovider</th><td>|\s</td>|\s')
        print("biomaterial值" + biomaterrial)
        sampletyple = _extract_field(mydata, r'sample\stype</th><td>.{0,15}\s</td>|\s',
                                     r'sample\stype</th><td>|\s</td>|\s')
        treatment = _extract_field(mydata, r'treatment</th><td>.{0,15}\s</td>|\s',
                                   r'treatment</th><td>|\s</td>|\s')
        # Write the scraped fields into the next row of the workbook.
        pydata = copy(xlrd.open_workbook(excel_name))  # xlrd is read-only; copy() gives a writable sheet
        pydatasheet = pydata.get_sheet(0)
        line = line + 1
        pydatasheet.write(line, 1, breed)             # column 2: breed
        # BUG FIX: the original wrote re.findall(...)'s *list* here;
        # sampleID is already the bare SAMN accession string.
        pydatasheet.write(line, 2, sampleID)          # column 3: BioSample accession
        pydatasheet.write(line, 3, age)               # column 4: age
        pydatasheet.write(line, 4, sex)               # column 5: sex
        pydatasheet.write(line, 5, tissue)            # column 6: tissue
        pydatasheet.write(line, 6, biomaterrial)      # column 7: biomaterial provider
        pydatasheet.write(line, 7, sampletyple)       # column 8: sample type
        pydatasheet.write(line, 8, treatment)         # column 9: treatment
        print("现在完成了第" + str(line + 1) + "行")
        time.sleep(0.5)
        # Save after every row so progress survives a crash or retry.
        pydata.save(excel_name)


if __name__ == "__main__":
    # Workbook whose first column holds the SRR run IDs to look up.
    get_beefinformation('统计b502其他116个样本的breed.xls')

  • 6
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值