Python crawler: scraping NCBI records with multiple threads
The example below crawls the mouse single-cell dataset listing from NCBI SRA, 6404 result pages in total (the selenium version used is 3.141.0).
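Environment assumed by the script (inferred from its imports): Python 3, selenium 3.141.0 with a chromedriver.exe matching the local Chrome version, plus requests, lxml, and the cssselect package, which lxml needs for the .cssselect() calls used below.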
# -*- coding: utf-8 -*-
"""
@Time : 2023/7/9 19:04
@Auth : victor
@IDE  : PyCharm
"""
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool
from functools import partial
from threading import Lock
import time
import requests
def setoption():
    """
    Common Chrome settings: headless mode plus basic switches to work around
    simple anti-crawler checks.
    """
    chrome_options = Options()
    # Both switches go into a single excludeSwitches entry; calling
    # add_experimental_option twice with the same key would overwrite the first value.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    return chrome_options
# serialize file appends coming from different worker threads
write_lock = Lock()

def getonepage(pagetext, file_index):
    """Parse one result page, request every record's detail page and append
    the study, title and summary fields to data_<file_index>.txt."""
    tree = etree.HTML(pagetext)
    initurl = "https://www.ncbi.nlm.nih.gov"
    div_list = tree.xpath('//*[@id="maincontent"]/div/div[5]/div[@class="rprt"]')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    for div in div_list:
        href = div.xpath('.//p/a/@href')
        if not href:
            continue
        detail_url = initurl + href[0]
        response = requests.get(detail_url, headers=headers)
        # etree.HTML accepts the raw bytes directly, no decode/re-encode round trip needed
        detail_tree = etree.HTML(response.content)
        summary = detail_tree.cssselect(
            'div.expand.showed.sra-full-data > div > div, div.expand.showed.sra-full-data > div > div > span')
        # element.text may be None, so substitute an empty string before joining
        summary = " ".join(element.text or "" for element in summary)
        study = detail_tree.cssselect('#ResultView > div:nth-child(3)')
        study = [element.text or "" for element in study]
        title = detail_tree.cssselect('#ResultView > div:nth-child(3) > span')
        title = [element.text or "" for element in title]
        if study == [] or title == []:
            res = summary.replace("\n", "") + "\n"
        else:
            res = f"{study[0]}\t{title[0]}\t{summary}".replace("\n", "") + "\n"
        filename = f"data_{file_index}.txt"
        with write_lock:
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(res)
def mainprocess(chrome_options, executable_path, thread=4):
    """Walk through all result pages in batches of 60 and hand every batch
    to a thread pool that scrapes the detail pages."""
    bro = webdriver.Chrome(executable_path=executable_path, options=chrome_options)
    bro.get(
        "https://www.ncbi.nlm.nih.gov/sra?term=((((%22rna%20seq%22%5BStrategy%5D)%20AND%20%22transcriptomic%20single%20cell%22%5BSource%5D))%20AND%20%22Mus%20musculus%22%5Borgn%3A__txid10090%5D)")
    pagetext = bro.page_source
    allpagetree = etree.HTML(pagetext)
    # the total number of result pages is stored in the 'last' attribute of the page-number input
    allpage = int(allpagetree.xpath('//*[@id="pageno2"]/@last')[0])
    batch_size = 60
    total_pages = allpage
    total_batches = total_pages // batch_size
    remaining_pages = total_pages % batch_size
    start_page = 1
    end_page = start_page + batch_size - 1
    for batch in range(1, total_batches + 1):
        pagetext_list = []
        for page in range(start_page, end_page + 1):
            # jump to the requested page via the page-number input box
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            # give the new result page a moment to render before grabbing its source
            time.sleep(2)
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)
        # scrape the collected pages of this batch with a pool of worker threads
        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=batch), pagetext_list)
        pool.close()
        pool.join()
        print(f"Saved batch {batch}")
        start_page += batch_size
        end_page += batch_size
    if remaining_pages > 0:
        # handle the last, incomplete batch the same way
        pagetext_list = []
        for page in range(start_page, start_page + remaining_pages):
            page_input = bro.find_element_by_id('pageno2')
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            time.sleep(2)
            pagetext = bro.page_source
            print(f"Append page {page} to the queue.")
            pagetext_list.append(pagetext)
        pool = Pool(thread)
        pool.map(partial(getonepage, file_index=total_batches + 1), pagetext_list)
        pool.close()
        pool.join()
        print("Saved remaining pages")
    bro.quit()
if __name__ == "__main__":
    chrome_options = setoption()
    mainprocess(chrome_options, r"chromedriver.exe", 4)
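Each worker thread appends one line per SRA record (the study, title and summary fields, tab-separated) to data_<batch>.txt. If a single file is more convenient afterwards, the per-batch outputs can be merged with a short sketch like the one below; the output name all_batches.tsv is just an example, not part of the crawler itself.
import glob

# optional post-processing: concatenate the per-batch files written by getonepage
with open("all_batches.tsv", "w", encoding="utf-8") as out:
    for path in sorted(glob.glob("data_*.txt")):
        with open(path, encoding="utf-8") as f:
            out.write(f.read())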