Python crawler: scraping NCBI records with multiple threads
The example below crawls the mouse single-cell dataset listing from NCBI SRA, 6404 result pages in total (the selenium version used is 3.141.0).
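Environment assumed by the script (inferred from its imports): Python 3, selenium 3.141.0 with a chromedriver.exe matching the local Chrome version, plus requests, lxml, and the cssselect package, which lxml needs for the .cssselect() calls used below.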
# -*- coding: utf-8 -*-
"""
@Time : 2023/7/9 19:04
@Auth : victor
@IDE  : PyCharm
"""
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool
from functools import partial
from threading import Lock
import time
import requests
def setoption():
    """
    Common Chrome settings: headless mode plus basic switches to work around
    simple anti-crawler checks.
    """
    chrome_options = Options()
    # Both switches go into a single excludeSwitches entry; calling
    # add_experimental_option twice with the same key would overwrite the first value.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    return chrome_options
# serialize file appends coming from different worker threads
write_lock = Lock()

def getonepage(pagetext, file_index):
    """Parse one result page, request every record's detail page and append
    the study, title and summary fields to data_<file_index>.txt."""
    tree = etree.HTML(pagetext)
    initurl = "https://www.ncbi.nlm.nih.gov"
    div_list = tree.xpath('//*[@id="maincontent"]/div/div[5]/div[@class="rprt"]')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    for div in div_list:
        href = div.xpath('.//p/a/@href')
        if not href:
            continue
        detail_url = initurl + href[0]
        response = requests.get(detail_url, headers=headers)
        # etree.HTML accepts the raw bytes directly, no decode/re-encode round trip needed
        detail_tree = etree.HTML(response.content)
        summary = detail_tree.cssselect(
            'div.expand.showed.sra-full-data > div > div, div.expand.showed.sra-full-data > div > div > span')
        # element.text may be None, so substitute an empty string before joining
        summary = " ".join(element.text or "" for element in summary)
        study = detail_tree.cssselect('#ResultView > div:nth-child(3)')
        study = [element.text or "" for element in study]
        title = detail_tree.cssselect('#ResultView > div:nth-child(3) > span')
        title = [element.text or "" for element in title]
        if study == [] or title == []:
            res = summary.replace("\n", "") + "\n"
        else:
            res = f"{study[0]}\t{title[0]}\t{summary}".replace("\n", "") + "\n"
        filename = f"data_{file_index}.txt"
        with write_lock:
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(res)
def mainprocess(chrome_options, executable_path, thread=4):
    """Walk through all result pages in batches of 60 and hand every batch
    to a thread pool that scrapes the detail pages."""
    bro = webdriver.Chrome(executable_path=executable_path, options=chrome_options)
    bro.get(
        "https://www.ncbi.nlm.nih.gov/sra?term=((((%22rna%20seq%22%5BStrategy%5D)%20AND%20%22transcriptomic%20single%20cell%22%5BSource%5D))%20AND%20%22Mus%20musculus%22%5Borgn%3A__txid10090%5D)")
    pagetext = bro.page_source
    allpagetree = etree.HTML(pagetext)
    # the total number of result pages is stored in the 'last' attribute of the page-number input
    allpage = int(allpagetree.xpath('//*[@id="pageno2"]/@last')[0])
    batch_size = 60
    total_pages = allpage
    total_batches = total_pages // batch_size
    remaining_pages = total_pages % batch_size
    start_page = 1
    end_page = start_page + batch_size - 1
    for batch in range(1, total_batches + 1):
        pagetext_list = []
        for page in range(start_page, end_page + 1):
            # jump to the requested page via the page-number input box
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            # give the new result page a moment to render before grabbing its source
            time.sleep(2)
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)
        # scrape the collected pages of this batch with a pool of worker threads
        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=batch), pagetext_list)
        pool.close()
        pool.join()
        print(f"Saved batch {batch}")
        start_page += batch_size
        end_page += batch_size
    if remaining_pages > 0:
        # handle the last, incomplete batch the same way
        pagetext_list = []
        for page in range(start_page, start_page + remaining_pages):
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            time.sleep(2)
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)
        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=total_batches + 1), pagetext_list)
        pool.close()
        pool.join()
        print("Saved remaining pages")
    bro.quit()
if __name__ == "__main__":
    chrome_options = setoption()
    mainprocess(chrome_options, r"chromedriver.exe", 4)
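Each worker thread appends one line per SRA record (the study, title and summary fields, tab-separated) to data_<batch>.txt. If a single file is more convenient afterwards, the per-batch outputs can be merged with a short sketch like the one below; the output name all_batches.tsv is just an example, not part of the crawler itself.
import glob

# optional post-processing: concatenate the per-batch files written by getonepage
with open("all_batches.tsv", "w", encoding="utf-8") as out:
    for path in sorted(glob.glob("data_*.txt")):
        with open(path, encoding="utf-8") as f:
            out.write(f.read())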