Python Web Scraping (Bioinformatics)

A multithreaded Python scraper for pulling dataset metadata from NCBI.

The example below crawls mouse single-cell datasets from the SRA, 6,404 result pages in total (selenium version 3.141.0).
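
The script depends on requests, lxml, and selenium pinned to 3.141.0 (it uses the old executable_path / find_element_by_id API, which Selenium 4 later removed), plus a ChromeDriver binary matching your local Chrome. Assuming a pip-based environment, setup looks like:

pip install selenium==3.141.0 requests lxml cssselect

(cssselect is needed because the parser calls lxml's cssselect() method.)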

# -*- coding: utf-8 -*-
"""
@Time : 2023/7/9 19:04
@Auth : victor
@IDE  : PyCharm
"""
from lxml import etree
from selenium import webdriver
from multiprocessing.dummy import Pool
from functools import partial
import os
import time
import requests
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options



def setoption():
    """
    Standard Chrome options for headless scraping with basic
    anti-bot-detection settings.
    """
    chrome_options = Options()
    # Hide the "controlled by automated software" automation flag and
    # suppress DevTools log noise.
    chrome_options.add_experimental_option('excludeSwitches',
                                           ['enable-logging', 'enable-automation'])
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")

    return chrome_options


def getonepage(pagetext, file_index):
    """Parse one results page and append each record's details to a batch file."""
    tree = etree.HTML(pagetext)
    initurl = "https://www.ncbi.nlm.nih.gov"
    # Each search hit is a div.rprt block under #maincontent.
    div_list = tree.xpath('//*[@id="maincontent"]/div/div[5]/div[@class="rprt"]')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    for div in div_list:
        detail_url = initurl + div.xpath('.//p/a/@href')[0]
        response = requests.get(detail_url, headers=headers)
        # etree.HTML parses raw bytes directly; no decode/re-encode round trip needed.
        detail_tree = etree.HTML(response.content)

        summary = detail_tree.cssselect(
            'div.expand.showed.sra-full-data > div > div, div.expand.showed.sra-full-data > div > div > span')
        # element.text can be None when the text sits in child elements;
        # guard with "or ''" so the join does not raise a TypeError.
        summary = " ".join(element.text or "" for element in summary)

        study = detail_tree.cssselect('#ResultView > div:nth-child(3)')
        study = [element.text for element in study]

        title = detail_tree.cssselect('#ResultView > div:nth-child(3) > span')
        title = [element.text for element in title]

        # Some records lack the study/title block; fall back to the summary alone.
        if not study or not title:
            res = summary.replace("\n", "") + "\n"
        else:
            res = f"{study[0]}\t{title[0]}\t{summary}".replace("\n", "") + "\n"

        filename = f"data_{file_index}.txt"
        # Several worker threads append to the same batch file; small buffered
        # writes rarely interleave, but a threading.Lock around this block is
        # the safe option if output lines ever come out garbled.
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(res)


def mainprocess(chrome_options, executable_path, thread=4):
    # In selenium 3.141.0 the deprecated chrome_options= argument silently
    # overrides options=, so all settings are merged into one Options object.
    bro = webdriver.Chrome(options=chrome_options, executable_path=executable_path)

    # SRA search: "rna seq"[Strategy] AND "transcriptomic single cell"[Source]
    # AND "Mus musculus"[orgn:txid10090]
    bro.get(
        "https://www.ncbi.nlm.nih.gov/sra?term=((((%22rna%20seq%22%5BStrategy%5D)%20AND%20%22transcriptomic%20single%20cell%22%5BSource%5D))%20AND%20%22Mus%20musculus%22%5Borgn%3A__txid10090%5D)")

    # The pager input (#pageno2) exposes the total page count in its "last" attribute.
    pagetext = bro.page_source
    allpagetree = etree.HTML(pagetext)
    allpage = int(allpagetree.xpath('//*[@id="pageno2"]/@last')[0])

    # Crawl in batches of 60 pages so page sources are not all held in memory
    # at once; each batch is parsed by the thread pool and written to its own file.
    batch_size = 60
    total_pages = allpage
    total_batches = total_pages // batch_size
    remaining_pages = total_pages % batch_size

    start_page = 1
    end_page = start_page + batch_size - 1

    for batch in range(1, total_batches + 1):
        pagetext_list = []
        for page in range(start_page, end_page + 1):
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            # Give the new results page a moment to load; without a wait,
            # page_source may still hold the previous page.
            time.sleep(2)
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)

        # multiprocessing.dummy.Pool is a thread pool (not processes), so the
        # workers share this process and can append to the same batch file.
        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=batch), pagetext_list)
        pool.close()
        pool.join()

        print(f"Saved batch {batch}")

        start_page += batch_size
        end_page += batch_size

    if remaining_pages > 0:
        pagetext_list = []
        for page in range(start_page, start_page + remaining_pages):
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            time.sleep(2)  # same load wait as in the batch loop above
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)

        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=total_batches + 1), pagetext_list)
        pool.close()
        pool.join()

        print(f"Saved remaining pages")

    bro.quit()


if __name__ == "__main__":
    chrome_options = setoption()
    mainprocess(chrome_options, r"chromedriver.exe", thread=4)
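
If you are on Selenium 4 rather than the 3.141.0 pinned above, note that executable_path= and the find_element_by_id helper were deprecated and later removed. A minimal sketch of the equivalent calls, untested against this script:

# Selenium 4 (sketch): Service wraps the driver path, and element lookup
# goes through find_element(By.ID, ...).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

service = Service(executable_path="chromedriver.exe")  # same local path as above
bro = webdriver.Chrome(service=service, options=chrome_options)
page_input = bro.find_element(By.ID, "pageno2")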
