# 简单爬虫实践 (simple crawler practice)
# 手里有了RUNID,需要收集这些测序生物样本的品种等相关信息。
# 记录于此,有待改善。
'''
@author:yanglv
2020年11月7日20:18:49
简单爬虫---根据表格中的runid,在NCBI爬取Biosample信息,并写入表格。
'''
import requests
import re
from bs4 import BeautifulSoup
import xlrd
from xlutils.copy import copy
import time
from retrying import retry
def open_excel(excel_name):
    """Return the SRR accessions from column A of Sheet1 that still need processing.

    Rows already filled in (as counted by jump()) plus the header row are
    skipped, so a rerun resumes exactly where the last run stopped.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    accessions = sheet.col_values(0)  # column A holds the SRR run IDs
    start = jump(excel_name) + 1      # +1 skips the header row
    return accessions[start:]
def qukongge(mylist=None):
    """Drop single-space placeholder entries (' ') from *mylist*.

    The scraping regexes produce many lone-whitespace matches; this filters
    the exact single-space ones out.  When nothing (or None) remains, a
    one-element [' '] list is returned so callers can always index [0].

    BUG FIX: the original iterated its default argument (None) directly and
    raised TypeError when called with no argument; None now yields [' '].
    """
    if mylist is None:
        return [' ']
    filtered = [item for item in mylist if item != ' ']
    return filtered if filtered else [' ']
def jump(excel_name):
    """Count rows whose breed cell (column B) is already filled in.

    Used as a resume point: the count tells the caller how many data rows
    were written by a previous run and can therefore be skipped.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    breed_cells = sheet.col_values(1)[1:]  # column B, header row excluded
    done = sum(1 for cell in breed_cells if cell != '')
    print("我已经有了的数据数为: " + str(done))
    return done
def _extract_field(page_html, find_pattern, strip_pattern):
    """Pull the first matching table cell out of *page_html* and strip its markup.

    Mirrors the original per-field logic exactly: re.findall, drop lone-space
    hits via qukongge (which falls back to [' '] when nothing matched), then
    re.sub the surrounding markup away from the first surviving hit.
    """
    hits = qukongge(re.findall(find_pattern, page_html))
    return re.sub(strip_pattern, '', hits[0])


# NOTE(review): retry() with no arguments retries forever on ANY exception;
# consider bounding it, e.g. @retry(stop_max_attempt_number=5).
@retry()
def get_beefinformation(excel_name):
    """Main routine: for every pending SRR accession in column A of *excel_name*,
    look up its NCBI BioSample page and write breed/sample-ID/age/sex/tissue/
    biomaterial-provider/sample-type/treatment into columns B-I.

    The workbook is re-opened, copied (xlrd -> xlutils -> xlwt) and saved once
    per row, so a crash loses at most one row and jump() lets a rerun resume.
    """
    srr_list = open_excel(excel_name)
    line = jump(excel_name)
    # Loop-invariant request headers, hoisted out of the loop.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    for srr in srr_list:
        print(srr)
        url = 'https://www.ncbi.nlm.nih.gov/sra/?term=' + srr
        print("现在进行的网页为" + url)
        # BUG FIX: the headers dict was passed positionally, which requests.get
        # treats as the *params* argument; it must be the headers= keyword.
        response = requests.get(url, headers=headers)
        htmlfile = BeautifulSoup(response.text, 'lxml')
        # The BioSample anchor sits in the 4th or 3rd result div depending on page layout.
        sample = htmlfile.select('#ResultView > div:nth-child(4) > span > div > a:nth-child(1)')
        if not sample:
            sample = htmlfile.select('#ResultView > div:nth-child(3) > span > div > a:nth-child(1)')
        print(str(sample))
        anchor = str(sample[0])
        link = re.findall(r'/\w+/\w+', anchor)[0]  # relative BioSample URL
        print(link)
        sampleID = re.findall(r'SAMN\d+', anchor)[0]  # BioSample accession
        print(sampleID)
        time.sleep(1)  # be polite to NCBI between the two requests

        url = 'https://www.ncbi.nlm.nih.gov' + link
        print("生物样本网页" + url)
        response1 = requests.get(url, headers=headers)  # BUG FIX: headers= keyword (see above)
        htmlfile = BeautifulSoup(response1.text, 'lxml')
        mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(4) > dd')
        if not mydata:
            mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(3) > dd')
        print("-----生物样本数据------")
        print(mydata)
        mydata = str(mydata[0])

        # Each pattern pair is kept byte-identical to the original (their small
        # inconsistencies — e.g. breed's strip pattern has no \s variants — are preserved).
        breed = _extract_field(mydata, r'breed</th><td>.{0,15}\s</td>', r'breed</th><td>|</td>')
        print("breed值" + breed)
        age = _extract_field(mydata, r'age</th><td>.{0,15}\s</td>|\s', r'age</th><td>|\s</td>|\s')
        print("age值" + age)
        sex = _extract_field(mydata, r'sex</th><td>.{0,15}\s</td>|\s', r'sex</th><td>|\s</td>|\s')
        print("sex值" + sex)
        tissue = _extract_field(mydata, r'tissue</th><td>.{0,15}</td>|\s', r'tissue</th><td>|\s</td>|\s')
        print("tissue值" + tissue)
        biomaterial = _extract_field(mydata, r'biomaterial\sprovider</th><td>.{0,15}\s</td>|\s',
                                     r'biomaterial\sprovider</th><td>|\s</td>|\s')
        print("biomaterial值" + biomaterial)
        sampletype = _extract_field(mydata, r'sample\stype</th><td>.{0,15}\s</td>|\s',
                                    r'sample\stype</th><td>|\s</td>|\s')
        treatment = _extract_field(mydata, r'treatment</th><td>.{0,15}\s</td>|\s',
                                   r'treatment</th><td>|\s</td>|\s')

        # Write the row back: xlrd reads, xlutils.copy yields a writable xlwt workbook.
        workbook = xlrd.open_workbook(excel_name)
        writable = copy(workbook)
        sheet = writable.get_sheet(0)
        line = line + 1
        sheet.write(line, 1, breed)       # column B: breed
        # BUG FIX: the original wrote re.findall()'s *list* into this cell
        # (xlwt cannot serialize a list); write the accession string itself.
        sheet.write(line, 2, sampleID)    # column C: BioSample accession
        sheet.write(line, 3, age)         # column D: age
        sheet.write(line, 4, sex)         # column E: sex
        sheet.write(line, 5, tissue)      # column F: tissue
        sheet.write(line, 6, biomaterial) # column G: biomaterial provider
        sheet.write(line, 7, sampletype)  # column H: sample type
        sheet.write(line, 8, treatment)   # column I: treatment
        print("现在完成了第" + str(line + 1) + "行")
        time.sleep(0.5)
        writable.save(excel_name)  # save every row so a crash loses at most one row
if __name__ == "__main__":
    # Target workbook: column A holds the SRR run IDs; scraped BioSample
    # fields are appended row by row into columns B-I.
    excel_name = '统计b502其他116个样本的breed.xls'
    get_beefinformation(excel_name)