问题
有两种关键字,需要在ncbi上面进行查询,返回bioProjectID
查询完发现网站地址如下:这就是我们需要请求的页面
https://www.ncbi.nlm.nih.gov/bioproject/?term=(frontal%5BAll+Fields%5D+AND+ASD%5BAll+Fields%5D)+AND+%22org+human%22%5BFilter%5D
点击查看此时的网页
1 处是我们的筛选条件
2 处是我们需要返回的bioProjectID
请求地址是: 'https://www.ncbi.nlm.nih.gov/bioproject/?term=(%s[All Fields] AND %s[All Fields]) AND "org human"[Filter]' % ( d, b)
d b 分别是需要查询的两个关键字:frontal,ASD
返回的结果就是和这两个关键字相关的bioProjectID 条目
爬虫生成的文件:
3ASD_brainArea_bioProjectID.tsv 关键字1,关键字2,bioprojectID
文件 3bioProjectID_url.tsv
给出这个id的详情页面,方便在excel中点击跳转
爬虫源码
只需要把index文件替换为自己需要查询的关键字就行,n_k 表示第一种关键字的个数
如这个index文件: 表示前三个是第一种关键字,n_k = 3
# Title : getBioProjectID.py
# Created by: julse@qq.com
# Created on: 2021/8/6 8:51
# des : TODO
# ((ASD) AND "Homo sapiens"[orgn:__txid9606]) AND bioproject_sra[filter] NOT bioproject_gap[filter]
# (frontal[All Fields] AND ASD[All Fields]) AND "org human"[Filter]
# https://www.ncbi.nlm.nih.gov/bioproject/?term=(frontal%5BAll+Fields%5D+AND+ASD%5BAll+Fields%5D)+AND+%22org+human%22%5BFilter%5D
# #maincontent > div > div:nth-child(5) > div:nth-child(1) > div.rslt > div.aux > div > dl > dl
import time
from scrapy import Selector
import requests
import pandas as pd
def queryBP(URL):
    """Fetch an NCBI BioProject search results page and extract BioProject IDs.

    :param URL: full NCBI bioproject search URL
             (e.g. https://www.ncbi.nlm.nih.gov/bioproject/?term=...)
    :return: list of ID strings scraped from the page; empty list if the
             page contains no matching entries
    :raises requests.exceptions.RequestException: on network failure/timeout
    """
    # Timeout prevents the crawl from hanging forever on a stalled connection.
    res = requests.get(URL, timeout=30)
    selector = Selector(text=res.text)
    # Multi-result page: each hit's ID sits in the 4th <dd> of the aux <dl>.
    se = selector.css('#maincontent > div > div > div > div.rslt > div.aux > div > dl > dl > dd:nth-child(4)')
    idlist = se.re(r'\d+')
    if not idlist:
        # Single-result page uses a different layout — NCBI redirects a
        # one-hit search straight to the project detail view.
        se = selector.css('#maincontent > div > div:nth-child(5) > div > div.Right > span')
        idlist = se.re(r'\d+')
    return idlist
# maincontent > div > div:nth-child(5) > div > div.Right > span
def queryAndSave(f1_in, f1, f2, n_k):
    """Query NCBI BioProject for every (disease, brain-area) keyword pair
    and save the matching BioProject IDs.

    :param f1_in: path of the keyword index file (one keyword per line,
                  no header); the first n_k lines are the first keyword
                  type, the rest are the second type
    :param f1: output TSV path — one row per (keyword1, keyword2, id),
               'notfound' when a pair has no hits
    :param f2: output CSV path — the deduplicated list of BioProject IDs
    :param n_k: number of leading lines in f1_in that belong to the first
                keyword type
    :return: None
    """
    df = pd.read_csv(f1_in, header=None)
    keylist = df[0].values
    disease = keylist[:n_k]
    brainA = keylist[n_k:]
    bpid = []
    with open(f1, 'w') as fi:
        fi.write('ASD\tbrainArea\tbioProjectID\n')
        for d in disease:
            for b in brainA:
                print(d, b)
                URL = 'https://www.ncbi.nlm.nih.gov/bioproject/?term=(%s[All Fields] AND %s[All Fields]) AND "org human"[Filter]' % (
                    d, b)
                # Fetch once and reuse: the original called queryBP(URL)
                # twice, issuing a duplicate HTTP request per keyword pair.
                items = queryBP(URL)
                if len(items) == 0:
                    fi.write('%s\t%s\t%s\n' % (d, b, 'notfound'))
                for item in items:
                    fi.write('%s\t%s\t%s\n' % (d, b, item))
                    bpid.append(item)
                # Flush once per pair so partial results survive a crash,
                # without paying a flush per line.
                fi.flush()
    # Deduplicate: the same project can match several keyword pairs.
    df1 = pd.DataFrame(bpid).drop_duplicates()
    df1.to_csv(f2, header=['bioProjectID'], index=None)
if __name__ == '__main__':
    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()
    # Keyword index: first n_k lines are type-1 keywords, the rest type-2.
    f1_in = 'index.txt'
    f1 = '3ASD_brainArea_bioProjectID.tsv'   # keyword1 / keyword2 / id rows
    f2 = '3bioProjectID.tsv'                 # deduplicated id list
    f3 = '3bioProjectID_url.tsv'             # id + clickable detail URL
    n_k = 3
    queryAndSave(f1_in, f1, f2, n_k)
    # Previous runs for reference:
    # stop 2021-08-06 09:49:30 / time 915.471342086792
    # stop 2021-08-06 10:39:18 / time 1772.6134023666382
    df = pd.read_csv(f2)
    # Build a detail-page URL per ID so the TSV is clickable in Excel.
    # (The original `df[1] = df.applymap(...)` relied on deprecated
    # DataFrame.applymap and single-column alignment; Series.map is the
    # supported equivalent and yields the same output file.)
    df['url'] = df['bioProjectID'].map(
        lambda x: 'https://www.ncbi.nlm.nih.gov/bioproject/%s' % x)
    df.to_csv(f3, header=['bioProjectID', 'url'], index=None)
    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)