# 简单爬虫实践 (simple crawler practice)
# 手里有了RUNID,需要收集这些测序生物样本的品种等相关信息。
# 记录于此,有待改善。
'''
@author:yanglv
2020年11月7日20:18:49
简单爬虫---根据表格中的runid,在NCBI爬取Biosample信息,并写入表格。
'''
import requests
import re
from bs4 import BeautifulSoup
import xlrd
from xlutils.copy import copy
import time
from retrying import retry
def open_excel(excel_name):
    """Return the SRR accessions from column A of Sheet1 that still need processing.

    Rows already filled in (as counted by jump()) plus the header row are
    skipped, so a rerun resumes exactly where the last run stopped.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    accessions = sheet.col_values(0)  # column A holds the SRR run IDs
    start = jump(excel_name) + 1      # +1 skips the header row
    return accessions[start:]
def qukongge(mylist=None):
    """Drop single-space placeholder entries (' ') from *mylist*.

    The scraping regexes produce many lone-whitespace matches; this filters
    the exact single-space ones out.  When nothing (or None) remains, a
    one-element [' '] list is returned so callers can always index [0].

    BUG FIX: the original iterated its default argument (None) directly and
    raised TypeError when called with no argument; None now yields [' '].
    """
    if mylist is None:
        return [' ']
    filtered = [item for item in mylist if item != ' ']
    return filtered if filtered else [' ']
def jump(excel_name):
    """Count rows whose breed cell (column B) is already filled in.

    Used as a resume point: the count tells the caller how many data rows
    were written by a previous run and can therefore be skipped.
    """
    workbook = xlrd.open_workbook(excel_name)
    sheet = workbook.sheet_by_name('Sheet1')
    breed_cells = sheet.col_values(1)[1:]  # column B, header row excluded
    done = sum(1 for cell in breed_cells if cell != '')
    print("我已经有了的数据数为: " + str(done))
    return done
def _extract_field(page_html, find_pattern, strip_pattern):
    """Pull the first matching table cell out of *page_html* and strip its markup.

    Mirrors the original per-field logic exactly: re.findall, drop lone-space
    hits via qukongge (which falls back to [' '] when nothing matched), then
    re.sub the surrounding markup away from the first surviving hit.
    """
    hits = qukongge(re.findall(find_pattern, page_html))
    return re.sub(strip_pattern, '', hits[0])


# NOTE(review): retry() with no arguments retries forever on ANY exception;
# consider bounding it, e.g. @retry(stop_max_attempt_number=5).
@retry()
def get_beefinformation(excel_name):
    """Main routine: for every pending SRR accession in column A of *excel_name*,
    look up its NCBI BioSample page and write breed/sample-ID/age/sex/tissue/
    biomaterial-provider/sample-type/treatment into columns B-I.

    The workbook is re-opened, copied (xlrd -> xlutils -> xlwt) and saved once
    per row, so a crash loses at most one row and jump() lets a rerun resume.
    """
    srr_list = open_excel(excel_name)
    line = jump(excel_name)
    # Loop-invariant request headers, hoisted out of the loop.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    for srr in srr_list:
        print(srr)
        url = 'https://www.ncbi.nlm.nih.gov/sra/?term=' + srr
        print("现在进行的网页为" + url)
        # BUG FIX: the headers dict was passed positionally, which requests.get
        # treats as the *params* argument; it must be the headers= keyword.
        response = requests.get(url, headers=headers)
        htmlfile = BeautifulSoup(response.text, 'lxml')
        # The BioSample anchor sits in the 4th or 3rd result div depending on page layout.
        sample = htmlfile.select('#ResultView > div:nth-child(4) > span > div > a:nth-child(1)')
        if not sample:
            sample = htmlfile.select('#ResultView > div:nth-child(3) > span > div > a:nth-child(1)')
        print(str(sample))
        anchor = str(sample[0])
        link = re.findall(r'/\w+/\w+', anchor)[0]  # relative BioSample URL
        print(link)
        sampleID = re.findall(r'SAMN\d+', anchor)[0]  # BioSample accession
        print(sampleID)
        time.sleep(1)  # be polite to NCBI between the two requests

        url = 'https://www.ncbi.nlm.nih.gov' + link
        print("生物样本网页" + url)
        response1 = requests.get(url, headers=headers)  # BUG FIX: headers= keyword (see above)
        htmlfile = BeautifulSoup(response1.text, 'lxml')
        mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(4) > dd')
        if not mydata:
            mydata = htmlfile.select('#maincontent > div > div:nth-child(5) > div > div.docsum > dl:nth-child(3) > dd')
        print("-----生物样本数据------")
        print(mydata)
        mydata = str(mydata[0])

        # Each pattern pair is kept byte-identical to the original (their small
        # inconsistencies — e.g. breed's strip pattern has no \s variants — are preserved).
        breed = _extract_field(mydata, r'breed</th><td>.{0,15}\s</td>', r'breed</th><td>|</td>')
        print("breed值" + breed)
        age = _extract_field(mydata, r'age</th><td>.{0,15}\s</td>|\s', r'age</th><td>|\s</td>|\s')
        print("age值" + age)
        sex = _extract_field(mydata, r'sex</th><td>.{0,15}\s</td>|\s', r'sex</th><td>|\s</td>|\s')
        print("sex值" + sex)
        tissue = _extract_field(mydata, r'tissue</th><td>.{0,15}</td>|\s', r'tissue</th><td>|\s</td>|\s')
        print("tissue值" + tissue)
        biomaterial = _extract_field(mydata, r'biomaterial\sprovider</th><td>.{0,15}\s</td>|\s',
                                     r'biomaterial\sprovider</th><td>|\s</td>|\s')
        print("biomaterial值" + biomaterial)
        sampletype = _extract_field(mydata, r'sample\stype</th><td>.{0,15}\s</td>|\s',
                                    r'sample\stype</th><td>|\s</td>|\s')
        treatment = _extract_field(mydata, r'treatment</th><td>.{0,15}\s</td>|\s',
                                   r'treatment</th><td>|\s</td>|\s')

        # Write the row back: xlrd reads, xlutils.copy yields a writable xlwt workbook.
        workbook = xlrd.open_workbook(excel_name)
        writable = copy(workbook)
        sheet = writable.get_sheet(0)
        line = line + 1
        sheet.write(line, 1, breed)       # column B: breed
        # BUG FIX: the original wrote re.findall()'s *list* into this cell
        # (xlwt cannot serialize a list); write the accession string itself.
        sheet.write(line, 2, sampleID)    # column C: BioSample accession
        sheet.write(line, 3, age)         # column D: age
        sheet.write(line, 4, sex)         # column E: sex
        sheet.write(line, 5, tissue)      # column F: tissue
        sheet.write(line, 6, biomaterial) # column G: biomaterial provider
        sheet.write(line, 7, sampletype)  # column H: sample type
        sheet.write(line, 8, treatment)   # column I: treatment
        print("现在完成了第" + str(line + 1) + "行")
        time.sleep(0.5)
        writable.save(excel_name)  # save every row so a crash loses at most one row
if __name__ == "__main__":
    # Target workbook: column A holds the SRR run IDs; scraped BioSample
    # fields are appended row by row into columns B-I.
    excel_name = '统计b502其他116个样本的breed.xls'
    get_beefinformation(excel_name)