使用selenium批量爬取NCBI基因组注释数据_怎么从ncbi提取基因的注释信息-CSDN博客

本文链接：https://blog.csdn.net/Eumenidus/article/details/134336825

简介

GTF文件是基因组学中用于标记基因在基因组中的位置、并同时提供一些基本注释信息的文件，通常包含可变剪接类型、外显子位置等信息。本文简单介绍如何在给定物种名称的前提下，使用Python自动下载GTF文件。

由于文件所在页面使用了JavaScript来动态加载，因此需要使用selenium，本文对于selenium的使用方法不做额外介绍

代码

import os
import shutil
from time import sleep
import zipfile
from zipfile import BadZipFile

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import ElementClickInterceptedException,StaleElementReferenceException,NoSuchElementException


def download_ncbi_gtf(species,destination_path,page_timeout=120,download_timeout=600):
    """Drive the NCBI website to download the GTF annotation archive of a species.

    The function uses the module-level Selenium driver ``web``. It searches the
    "genome" database for ``species``, follows the intermediate result pages
    when the search does not land on the species page directly, opens the
    "Download" dialog, selects the GTF file (RefSeq when its GTF checkbox is
    enabled, GenBank otherwise), starts the download and waits until a new
    ``.zip`` file shows up in ``destination_path``.

    Args:
        species: Species name typed into the NCBI search box.
        destination_path: Directory that is polled for the downloaded archive.
        page_timeout: Approximate number of seconds to wait for the "Download"
            button of the species page (polled every 2 seconds).
        download_timeout: Approximate number of seconds to wait for the
            archive to appear (polled every second).

    Returns:
        The file name of the downloaded archive without its ``.zip`` suffix.

    Raises:
        RuntimeError: If the browser did not end up on a datasets taxonomy
            page, or the "Download" button could not be clicked in time.
        TimeoutError: If no new ``.zip`` file appeared in ``destination_path``.
    """
    web.get("https://www.ncbi.nlm.nih.gov/")
    homepage_selection=Select(web.find_element(by='id',value='database'))
    homepage_selection.select_by_value("genome") # switch the search scope to 'genome'
    input_box=web.find_element(by="id",value='term')
    input_box.send_keys(species)
    sleep(0.2)
    web.find_element(by='id',value='search').click()

    # The search may stop on a results page instead of the species page;
    # follow the suggested result on each intermediate page.
    if web.current_url.startswith("https://www.ncbi.nlm.nih.gov/search/all/?term="):
        web.find_element(by='xpath',value='//*[@id="maincontent"]/div/div/div[1]/section/div[1]/div/div/div[1]/div/h4/a').click()
    if web.current_url.startswith("https://www.ncbi.nlm.nih.gov/genome/?term="):
        web.find_element(by='xpath',value='//*[@id="feat_title"]').click()

    if not web.current_url.startswith("https://www.ncbi.nlm.nih.gov/datasets/taxonomy/"):
        # Fail loudly instead of implicitly returning None to the caller.
        raise RuntimeError(f"Could not reach the NCBI datasets page for species: {species}")

    # The xpath of the "Download" button is not stable, so it is located by
    # its text. The page is still rendering, therefore stale elements simply
    # mean "look again" rather than "give up".
    click_download=False
    for _ in range(max(1,int(page_timeout//2))):
        sleep(2)
        try:
            for button in web.find_elements(by=By.TAG_NAME,value='button'):
                if button.text.strip("'")=='Download':
                    button.click()
                    click_download=True
                    break # the DOM changes after the click; stop iterating
        except (StaleElementReferenceException,NoSuchElementException):
            continue
        if click_download:
            break
    if not click_download:
        raise RuntimeError(f"'Download' button was not found for species: {species}")

    filesource_list=web.find_elements(by='name',value='sourcedb')
    for button in filesource_list:
        if button.get_attribute('value')=='RefSeq':
            button.click()
            break
    sleep(1)
    refseq_gtf=web.find_element(by='name',value='GENOME_GTF')
    if refseq_gtf.is_enabled():
        refseq_gtf.click()
        web.find_element(by='name',value='GENOME_FASTA').click() # uncheck genome fasta checkbox
    else:
        # RefSeq GTF checkbox is disabled: fall back to the GenBank source.
        for button in filesource_list:
            if button.get_attribute('value')=='GenBank':
                button.click()
                break
        sleep(1)
        web.find_element(by='name',value='GENOME_FASTA').click() # uncheck genome fasta checkbox
        web.find_element(by='name',value='GENOME_GTF').click()

    # Remember archives that are already present so that only the file
    # produced by this download is reported.
    existing_zips={name for name in os.listdir(destination_path) if name.endswith(".zip")}

    for button in web.find_elements(by=By.TAG_NAME,value='button'):
        if button.get_attribute('data-testid')=='download-button':
            button.click()
            break

    # Poll the download folder until a new .zip file shows up.
    for _ in range(max(1,int(download_timeout))):
        sleep(1)
        new_zips=sorted(
            name for name in os.listdir(destination_path)
            if name.endswith(".zip") and name not in existing_zips
        )
        if new_zips:
            # Strip only the trailing suffix, not every ".zip" occurrence.
            return new_zips[-1][:-len(".zip")]

    raise TimeoutError(f"No zip file was downloaded to {destination_path!r} within {download_timeout} seconds")


def unzip_ncbi_file(species,zipfile_name,folder_path):
    """Extract ``genomic.gtf`` from a downloaded NCBI archive and name it after the species.

    The archive ``<folder_path>/<zipfile_name>.zip`` is extracted into
    ``folder_path``. The contained ``genomic.gtf`` is moved to
    ``<folder_path>/<species>.gtf`` (replacing a file of that name if one
    exists), and the archive, the extracted ``README.md`` and the extracted
    ``ncbi_dataset`` directory are removed afterwards.

    Args:
        species: Name used for the resulting ``.gtf`` file.
        zipfile_name: Archive file name without the ``.zip`` suffix.
        folder_path: Directory holding the archive; a trailing path separator
            is accepted but not required.

    Raises:
        zipfile.BadZipFile: If the archive is not a valid zip file.
        FileNotFoundError: If the archive does not exist or does not contain
            a ``genomic.gtf`` file.
    """
    zip_path=os.path.join(folder_path,zipfile_name+'.zip')
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path,'r') as zipped_file:
        member_names=zipped_file.namelist()
        zipped_file.extractall(folder_path)

    # Zip member names always use '/' as separator. Prefer the layout where
    # the data folder is named like the archive, otherwise take the first
    # genomic.gtf found anywhere in the archive.
    gtf_member=f'ncbi_dataset/data/{zipfile_name}/genomic.gtf'
    if gtf_member not in member_names:
        gtf_member=next(
            (name for name in member_names if name.split('/')[-1]=='genomic.gtf'),
            None,
        )
        if gtf_member is None:
            raise FileNotFoundError(f"genomic.gtf not found in {zip_path}")

    extracted_gtf=os.path.join(folder_path,*gtf_member.split('/'))
    os.replace(extracted_gtf,os.path.join(folder_path,species+'.gtf'))

    # Clean up everything that is no longer needed.
    readme_path=os.path.join(folder_path,'README.md')
    if 'README.md' in member_names and os.path.exists(readme_path):
        os.remove(readme_path)
    os.remove(zip_path)
    dataset_dir=os.path.join(folder_path,'ncbi_dataset')
    if os.path.isdir(dataset_dir):
        shutil.rmtree(dataset_dir)


if __name__=='__main__':
    destination_path="" # target download directory; fill this in before running
    species='Homo sapiens' # Latin name of the species to download
    # chromedriver is expected next to the working directory here; adjust as needed.
    drive_path=os.path.join(os.getcwd(),'chromedriver.exe')

    driver_service = Service(executable_path=drive_path)
    chrome_options = Options()
    chrome_options.add_experimental_option("excludeSwitches",["enable-logging"])
    chrome_options.add_experimental_option("prefs", {"download.default_directory": destination_path})
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--disable-gpu')
    # `web` stays a module-level name because download_ncbi_gtf reads it as a global.
    web=Chrome(service=driver_service,options=chrome_options)

    try:
        try:
            download_zip_filename=download_ncbi_gtf(species,destination_path)
        except ElementClickInterceptedException:
            # NCBI sometimes shows a pop-up asking whether to take part in a
            # survey. It blocks the clicks and causes
            # ElementClickInterceptedException; running the flow once more
            # (which reloads the start page) gets past it.
            sleep(3)
            download_zip_filename=download_ncbi_gtf(species,destination_path)

        try:
            # Unpack the archive and keep only the GTF file.
            unzip_ncbi_file(species,download_zip_filename,destination_path)
            print(download_zip_filename)
        except BadZipFile:
            print("Error for species: "+species)
            # Remove the broken archive(s) so a later run starts clean.
            for filename in os.listdir(destination_path):
                if filename.endswith(".zip"):
                    os.remove(os.path.join(destination_path,filename))
    finally:
        # Always shut the browser and the chromedriver process down.
        web.quit()

流程

这里简单讲一下脚本工作流程：

进入NCBI官网后在下拉菜单中选择“Genome”，并在搜索框中输入物种名称。
在这里插入图片描述

由于部分物种不能直接跳转到详情页，会先跳转到搜索结果或genome页，因此使用判断，直接点击推荐结果并进入，如下：
在这里插入图片描述
对应代码：

    # Excerpt from download_ncbi_gtf: some species do not open the detail page
    # directly. When the browser is on the global search-results page, click
    # the suggested result link.
    if web.current_url.startswith("https://www.ncbi.nlm.nih.gov/search/all/?term="):
        search_result_a_tag=web.find_element(by='xpath',value='//*[@id="maincontent"]/div/div/div[1]/section/div[1]/div/div/div[1]/div/h4/a')
        search_result_a_tag.click()
    # When the browser is on the genome search page, click the element with
    # id "feat_title" instead.
    if web.current_url.startswith("https://www.ncbi.nlm.nih.gov/genome/?term="):
        search_result_a_tag=web.find_element(by='xpath',value='//*[@id="feat_title"]')
        search_result_a_tag.click()

跳转至物种详情信息页后判断“Download”按钮是否成功加载，加载完成后再进行后续步骤

selenium其实自带了根据页面元素等条件判断页面是否加载完成的机制（显式等待），但由于此处Download按钮的xpath并不固定，无法直接用它来定位，因此需要通过按钮的文本字符串来进行判断，也就使用了while循环

在这里插入图片描述
此处对应代码：

  # Excerpt from download_ncbi_gtf: runs only when the browser is on a
  # datasets taxonomy page.
  if web.current_url.startswith("https://www.ncbi.nlm.nih.gov/datasets/taxonomy/"):
        click_download=False
        # Poll every 2 seconds until a button labelled 'Download' has been clicked.
        while not click_download:
            sleep(2)
            # Collect all <button> elements again on every pass.
            button_list=web.find_elements(by=By.TAG_NAME,value='button')
            try:
                for button in button_list:
                    # The button is identified by its text (single quotes
                    # stripped) because its xpath is not fixed.
                    if button.text.strip("'")=='Download':
                        button.click()
                        click_download=True
            # Either exception ends the polling loop.
            except StaleElementReferenceException:
                break
            except NoSuchElementException:
                break