python多线程爬取CKB数据库
python多线程爬取CKB数据库
简介
本文针对ckb数据库里的Gene Variant Detail Transcript 信息进行爬取:
1、进入网站https://ckb.jax.org/gene/grid,获取gene的名称及其相对应的链接。只有这些蓝色信息可以爬取。
2、点开链接后,获取其Variant的名称及其相应的链接。
3、点开链接后,将其对应的Transcript信息爬取下来。
本代码会将 Gene Variant Detail Transcript 信息 print 出来。由于数据量太大,单线程爬取的等待时间太长,因此添加了线程池以提升爬取速度;如果你的电脑扛得住,可以将线程数加大。
import requests
import bs4
from bs4 import BeautifulSoup
import threadpool
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request uses a 40-second timeout, and the response encoding is
    replaced by the one requests detects from the content before decoding.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string if the request failed
        (connection error, timeout, invalid URL, or an HTTP error status
        reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=40)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no page".  Only
        # request-related failures are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate and a long crawl can be interrupted.
        return ""
def get_gene_id(url):
    """Collect the gene buttons found on the page at *url*.

    Args:
        url: Address of the gene grid page.

    Returns:
        A dict mapping each gene name (newlines and spaces removed) to the
        absolute ``https://ckb.jax.org`` link taken from the anchor's href.
        The dict is empty when the page could not be fetched or contains no
        matching anchors.
    """
    soup = BeautifulSoup(getHTMLText(url), 'html.parser')
    gene_id_dict = {}
    # A plain string passed as ``attrs`` is matched against the CSS class.
    for a in soup.find_all(name="a", attrs="btn btn-default btn-gene btn-block"):
        href = a.get('href')
        if href is None:
            # An anchor without a link cannot be followed; skip it rather
            # than raising KeyError.
            continue
        # get_text() always returns a str, whereas .string is None when the
        # anchor has more than one child node.
        gene_name = a.get_text().replace("\n", "").replace(" ", "")
        gene_id_dict[gene_name] = "https://ckb.jax.org" + href
    return gene_id_dict
def gene_variant_link(gene, url):
    """List the variant links found on a gene page.

    Args:
        gene: Gene name, copied unchanged into every returned entry.
        url: Address of the gene page to scan.

    Returns:
        A list of ``[gene, variant_name, variant_url]`` lists, one for each
        anchor whose href starts with ``/geneVariant``.  The variant name is
        the anchor text with spaces and newlines removed; the URL is the
        href prefixed with ``https://ckb.jax.org``.
    """
    page = BeautifulSoup(getHTMLText(url), 'html.parser')
    return [
        [
            gene,
            anchor.text.replace(" ", "").replace("\n", ""),
            'https://ckb.jax.org' + anchor['href'],
        ]
        for anchor in page.select('a[href^="/geneVariant"]')
    ]
def print_gene_variant(aaa):
    """Fetch one variant page and print its transcript table rows.

    Each printed line starts with the tab-joined items of *aaa*, followed by
    the text of one child tag of the transcript table (spaces removed,
    newlines turned into tabs).  When the page has no table with id
    ``TranscriptTabTable``, a single line ending in ``NAN`` is printed.

    Args:
        aaa: A sequence of strings whose last item is the variant page URL
            (the ``[gene, variant, url]`` entries built by
            ``gene_variant_link``).

    Returns:
        None. The function only prints.
    """
    url = aaa[-1]
    soup = BeautifulSoup(getHTMLText(url), 'html.parser')
    # Search the document once instead of repeating find_all in every branch.
    tables = soup.find_all("table", attrs={"id": "TranscriptTabTable"})
    prefix = '\t'.join(aaa)

    if not tables:
        print(prefix, '\t', 'NAN')
        return
    if len(tables) > 2:
        # NOTE(review): pages with more than two matching tables have never
        # produced output; kept as is -- confirm whether they should.
        return

    # With one table use it; with two, use the second one.
    table = tables[-1]
    for tr in table.children:
        # Skip bare strings (whitespace between tags); only tags carry rows.
        if isinstance(tr, bs4.element.Tag):
            row = tr.text.replace(' ', '').replace('\n\n', '').strip().replace('\n', '\t')
            print(prefix, '\t', row)
# Entry point of the crawl: gene grid -> gene pages -> variant pages.
url = "https://ckb.jax.org/gene/grid"
gDict = get_gene_id(url)

# One pool is shared by every gene.  Creating a new ThreadPool per gene left
# its 10 idle workers behind each time, so the thread count kept growing.
pool = threadpool.ThreadPool(10)

for name, ID in gDict.items():
    list_link = gene_variant_link(name, ID)
    for task in threadpool.makeRequests(print_gene_variant, list_link):
        pool.putRequest(task)
    # Finish the current gene before queueing the next one so that the
    # output of different genes is not mixed together.
    pool.wait()
参考
本代码前半部分参考了另一篇文章,该文章的代码可以将 Gene Variant 的列表输出出来,也就是本文提到的第2张图。
https://www.jianshu.com/p/1d37a4f2f7ed