一、配置python环境
详见百度
二、Chromedriver安装
Chromedriver下载地址:
http://chromedriver.storage.googleapis.com/index.html
http://npm.taobao.org/mirrors/chromedriver/
需要下载与本机谷歌浏览器版本一致,具体配置百度。
三、安装软件Pycharm
官网下载即可。
四、爬虫
代码如下:
# 通过GSM号搜索ncbi中的Run,SRX细节页信息
# GSM号表中以GSM为列名,以GSM号为列内容
import requests #导入requests包
import pandas as pd
import re
from bs4 import BeautifulSoup
# Base URL of a GEO accession page; the GSM accession id is appended per request.
url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='

# Input table: a CSV with a column named 'gsm' listing the GSM accession numbers
# to scrape, one per row.
df = pd.read_csv("C://Users//admin//Desktop//pachong//GSE195956//srr.csv", encoding='utf-8')
row = df['gsm']

# Per-sample output fields, initialised empty so each tab-separated output row
# is well-defined even when a regex finds no match on the page.
Characteristic = ""
Source_name = ""
Organism = ""
SRR = ""
Instrument = ""
Strategy = ""
Source = ""
Selection = ""
Layout = ""
# For every GSM accession: fetch its GEO page, extract sample metadata with
# regexes over the prettified HTML, append one row to re1.txt, then follow the
# linked SRX accession to the SRA page and append run details to re2.txt.
for gsm in row:
    gsmurl = url + gsm
    print(gsmurl)
    strhtml = requests.get(gsmurl, timeout=60)  # bounded wait instead of hanging forever
    sampleDetailSoup = BeautifulSoup(strhtml.text, 'lxml')
    detailHtml = sampleDetailSoup.prettify()

    # Reset per-sample fields so a failed match cannot leak the previous
    # sample's values into this row.
    titlename = ""
    Source_name = ""
    Organism = ""
    Characteristic = ""

    # Sample title.
    sampleTitle = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Title.*?</td>.*?<td style="text-align: justify">(.*?)</td>', re.S)
    titlenameArr = re.findall(sampleTitle, detailHtml)
    if titlenameArr:
        titlename = titlenameArr[0].strip()
    print(titlename)

    # Source name / Organism / Characteristics rows of the GSM page.
    pattern3_1 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Source name.*?</td>.*?<td style="text-align: justify">(.*?)<br/>', re.S)
    pattern3_2 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Organism.*?</td>.*?<td>.*?<a href=.*?>(.*?)</a>', re.S)
    pattern3_3 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Characteristics.*?</td>.*?<td style="text-align: justify">(.*?)</td>', re.S)
    Source_nameArr = re.findall(pattern3_1, detailHtml)
    OrganismArr = re.findall(pattern3_2, detailHtml)
    Characteristics = re.findall(pattern3_3, detailHtml)
    if Source_nameArr:
        Source_name = Source_nameArr[0].strip()
    if OrganismArr:
        Organism = OrganismArr[0].strip()
    if Characteristics:
        # The characteristics cell holds several '<br/>'-separated entries;
        # keep at most the first four, joined with '|'.
        Characteristics = Characteristics[0].strip().replace('\n', '').split('<br/>')
        for chara in range(len(Characteristics)):
            if chara >= 4:
                break
            Characteristic = Characteristic + Characteristics[chara].strip() + "|"
        Characteristic = Characteristic[:-1]

    # One tab-separated row per GSM; the 'with' block closes the file itself,
    # so no explicit close() is needed.
    with open('C://Users//admin//Desktop//pachong//GSE195956//re1.txt', 'a', encoding='utf-8') as f:
        f.write(gsm + '\t' + titlename + '\t' + Source_name + '\t' + Organism + '\t' + Characteristic + '\n')

    # SRA experiment (SRX) accession linked from the GSM page.
    pattern3_4 = re.compile(r'<tr\svalign="top">.*?<td>.*?SRA.*?</td>.*?<td>.*?<a\shref=".*?">(.*?)</a>.*?</td>.*?</tr>', re.S)
    itemsSRX = re.findall(pattern3_4, detailHtml)
    if itemsSRX:
        for itemSRX in itemsSRX:
            if itemSRX.strip()[0:3] == 'SRX':
                url3 = 'https://www.ncbi.nlm.nih.gov/sra?term=' + itemSRX.strip()
                d = 0
                while d < 5:  # retry the SRA page up to 5 times on network errors
                    try:
                        responseSRX = requests.get(url3, timeout=60)
                        soupSRX = BeautifulSoup(responseSRX.text, 'lxml')
                        htmSRX = soupSRX.prettify()
                        # Run accessions (SRR...) listed on the SRA page, joined with '|'.
                        patternSRR = re.compile(r'<a\shref=".*?run=SRR.*?">\s(.*?)\s+</a>', re.S)
                        itemsSRR = re.findall(patternSRR, htmSRX)
                        for isrr in range(len(itemsSRR)):
                            itemsSRR[isrr] = itemsSRR[isrr].replace(' ', '')
                        SRR = '|'.join(itemsSRR)
                        # Library metadata fields of the SRA page; each falls back
                        # to '' when the field is absent.
                        instrumentPattern = re.compile(r'<div>.*?Instrument:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        StrategyPattern = re.compile(r'<div>.*?Strategy:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        SourcePattern = re.compile(r'<div>.*?Source:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        SelectionPattern = re.compile(r'<div>.*?Selection:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        LayoutPattern = re.compile(r'<div>.*?Layout:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        InstrumentArr = re.findall(instrumentPattern, htmSRX)
                        StrategyArr = re.findall(StrategyPattern, htmSRX)
                        SourceArr = re.findall(SourcePattern, htmSRX)
                        SelectionArr = re.findall(SelectionPattern, htmSRX)
                        LayoutArr = re.findall(LayoutPattern, htmSRX)
                        Instrument = InstrumentArr[0].strip() if InstrumentArr else ''
                        Strategy = StrategyArr[0].strip() if StrategyArr else ''
                        Source = SourceArr[0].strip() if SourceArr else ''
                        Selection = SelectionArr[0].strip() if SelectionArr else ''
                        Layout = LayoutArr[0].strip() if LayoutArr else ''
                        with open('C://Users//admin//Desktop//pachong//GSE195956//re2.txt', 'a', encoding='utf-8') as f:
                            f.write(gsm + '\t' + titlename + '\t' + SRR + '\t' + Instrument + '\t' + Strategy + '\t' + Source + '\t' + Selection + '\t' + Layout + '\n')
                        break
                    except requests.exceptions.RequestException:
                        print('报错了正在重试:')  # network error, retrying
                        d += 1
运行过程会报错,是因为缺少python包,根据报错cmd安装即可
pip install xxxxx
srr.csv文件内容为收集的GSM号,re1.txt和re2.txt均为输出文件。
爬虫的具体内容可以根据自己的需求修改代码,也就是代码中<div>字样的内容。