一、配置python环境
详见百度
二、Chromedriver安装
Chromedriver下载地址:
http://chromedriver.storage.googleapis.com/index.html
http://npm.taobao.org/mirrors/chromedriver/
需要下载与本机谷歌浏览器版本一致,具体配置百度。
三、安装软件Pycharm
官网下载即可。
四、爬虫
代码如下:
# 通过GSM号搜索ncbi中的Run,SRX细节页信息
# GSM号表中以GSM为列名,以GSM号为列内容
import requests #导入requests包
import pandas as pd
import re
from bs4 import BeautifulSoup
# Base URL of a GEO accession page; the GSM accession id is appended per request.
url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='

# Input table: a CSV with a column named 'gsm' listing the GSM accession numbers
# to scrape, one per row.
df = pd.read_csv("C://Users//admin//Desktop//pachong//GSE195956//srr.csv", encoding='utf-8')
row = df['gsm']

# Per-sample output fields, initialised empty so each tab-separated output row
# is well-defined even when a regex finds no match on the page.
Characteristic = ""
Source_name = ""
Organism = ""
SRR = ""
Instrument = ""
Strategy = ""
Source = ""
Selection = ""
Layout = ""
# For every GSM accession: fetch its GEO page, extract sample metadata with
# regexes over the prettified HTML, append one row to re1.txt, then follow the
# linked SRX accession to the SRA page and append run details to re2.txt.
for gsm in row:
    gsmurl = url + gsm
    print(gsmurl)
    strhtml = requests.get(gsmurl, timeout=60)  # bounded wait instead of hanging forever
    sampleDetailSoup = BeautifulSoup(strhtml.text, 'lxml')
    detailHtml = sampleDetailSoup.prettify()

    # Reset per-sample fields so a failed match cannot leak the previous
    # sample's values into this row.
    titlename = ""
    Source_name = ""
    Organism = ""
    Characteristic = ""

    # Sample title.
    sampleTitle = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Title.*?</td>.*?<td style="text-align: justify">(.*?)</td>', re.S)
    titlenameArr = re.findall(sampleTitle, detailHtml)
    if titlenameArr:
        titlename = titlenameArr[0].strip()
    print(titlename)

    # Source name / Organism / Characteristics rows of the GSM page.
    pattern3_1 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Source name.*?</td>.*?<td style="text-align: justify">(.*?)<br/>', re.S)
    pattern3_2 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Organism.*?</td>.*?<td>.*?<a href=.*?>(.*?)</a>', re.S)
    pattern3_3 = re.compile(r'<tr valign="top">.*?<td nowrap="">.*?Characteristics.*?</td>.*?<td style="text-align: justify">(.*?)</td>', re.S)
    Source_nameArr = re.findall(pattern3_1, detailHtml)
    OrganismArr = re.findall(pattern3_2, detailHtml)
    Characteristics = re.findall(pattern3_3, detailHtml)
    if Source_nameArr:
        Source_name = Source_nameArr[0].strip()
    if OrganismArr:
        Organism = OrganismArr[0].strip()
    if Characteristics:
        # The characteristics cell holds several '<br/>'-separated entries;
        # keep at most the first four, joined with '|'.
        Characteristics = Characteristics[0].strip().replace('\n', '').split('<br/>')
        for chara in range(len(Characteristics)):
            if chara >= 4:
                break
            Characteristic = Characteristic + Characteristics[chara].strip() + "|"
        Characteristic = Characteristic[:-1]

    # One tab-separated row per GSM; the 'with' block closes the file itself,
    # so no explicit close() is needed.
    with open('C://Users//admin//Desktop//pachong//GSE195956//re1.txt', 'a', encoding='utf-8') as f:
        f.write(gsm + '\t' + titlename + '\t' + Source_name + '\t' + Organism + '\t' + Characteristic + '\n')

    # SRA experiment (SRX) accession linked from the GSM page.
    pattern3_4 = re.compile(r'<tr\svalign="top">.*?<td>.*?SRA.*?</td>.*?<td>.*?<a\shref=".*?">(.*?)</a>.*?</td>.*?</tr>', re.S)
    itemsSRX = re.findall(pattern3_4, detailHtml)
    if itemsSRX:
        for itemSRX in itemsSRX:
            if itemSRX.strip()[0:3] == 'SRX':
                url3 = 'https://www.ncbi.nlm.nih.gov/sra?term=' + itemSRX.strip()
                d = 0
                while d < 5:  # retry the SRA page up to 5 times on network errors
                    try:
                        responseSRX = requests.get(url3, timeout=60)
                        soupSRX = BeautifulSoup(responseSRX.text, 'lxml')
                        htmSRX = soupSRX.prettify()
                        # Run accessions (SRR...) listed on the SRA page, joined with '|'.
                        patternSRR = re.compile(r'<a\shref=".*?run=SRR.*?">\s(.*?)\s+</a>', re.S)
                        itemsSRR = re.findall(patternSRR, htmSRX)
                        for isrr in range(len(itemsSRR)):
                            itemsSRR[isrr] = itemsSRR[isrr].replace(' ', '')
                        SRR = '|'.join(itemsSRR)
                        # Library metadata fields of the SRA page; each falls back
                        # to '' when the field is absent.
                        instrumentPattern = re.compile(r'<div>.*?Instrument:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        StrategyPattern = re.compile(r'<div>.*?Strategy:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        SourcePattern = re.compile(r'<div>.*?Source:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        SelectionPattern = re.compile(r'<div>.*?Selection:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        LayoutPattern = re.compile(r'<div>.*?Layout:.*?<span>\s(.*?)\s</span>.*?</div>', re.S)
                        InstrumentArr = re.findall(instrumentPattern, htmSRX)
                        StrategyArr = re.findall(StrategyPattern, htmSRX)
                        SourceArr = re.findall(SourcePattern, htmSRX)
                        SelectionArr = re.findall(SelectionPattern, htmSRX)
                        LayoutArr = re.findall(LayoutPattern, htmSRX)
                        Instrument = InstrumentArr[0].strip() if InstrumentArr else ''
                        Strategy = StrategyArr[0].strip() if StrategyArr else ''
                        Source = SourceArr[0].strip() if SourceArr else ''
                        Selection = SelectionArr[0].strip() if SelectionArr else ''
                        Layout = LayoutArr[0].strip() if LayoutArr else ''
                        with open('C://Users//admin//Desktop//pachong//GSE195956//re2.txt', 'a', encoding='utf-8') as f:
                            f.write(gsm + '\t' + titlename + '\t' + SRR + '\t' + Instrument + '\t' + Strategy + '\t' + Source + '\t' + Selection + '\t' + Layout + '\n')
                        break
                    except requests.exceptions.RequestException:
                        print('报错了正在重试:')  # network error, retrying
                        d += 1
运行过程会报错,是因为缺少python包,根据报错cmd安装即可
pip install xxxxx
srr.csv文件内容为收集的GSM号,re1.txt和re2.txt均为输出文件。
爬虫的具体内容可以根据自己的需求修改代码,也就是代码中<div>字样的内容。