Learning objectives:
Python practice: batch-downloading files from the web
Learning content:
The task has two parts: (1) for the 200-odd links listed in a spreadsheet, open each page and download its gbff and faa files; (2) download all of the csv.zip files from the COG fasta page. The code for part 1 is below (the file names it builds are a bit long-winded; using the names given on the site would also be fine):
import requests
import xlrd
from bs4 import BeautifulSoup
# read every row of the spreadsheet: column 1 holds the bacterium name, column 2 the page URL
def getAllUrl(fileName):
    table1 = []
    table2 = []
    data = xlrd.open_workbook(fileName)
    sheet1 = data.sheet_by_name("Sheet1")
    nrows = sheet1.nrows
    for i in range(nrows):
        table1.append(sheet1.cell(i, 1).value)
        table2.append(sheet1.cell(i, 2).value)
    return table1, table2
# scan all <a href> targets on the page and return the first gbff link and the first faa link
def getHtml(url, label, attr):
    gbff = " "
    faa = " "
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    for target in soup.find_all(label):
        try:
            value = target.get(attr)
            tempstr = value.split('.')
            if len(tempstr) < 4:  # the accessions in my bacteria set all contain a '.', so links that split into fewer than four parts can be skipped
                continue
            if tempstr[-2] == "gbff":
                gbff = value
            if tempstr[-2] == "faa":
                faa = value
            if gbff != " " and faa != " ":  # the page lists several gbff and faa files; keep the first of each
                break
        except AttributeError:
            continue
    return gbff, faa
# download the gbff and faa files linked from one page and save them under savepath
def download(url, savepath, bacname):
    gbff, faa = getHtml(url, 'a', 'href')
    resource1 = requests.get(url + "/" + gbff)
    resource2 = requests.get(url + "/" + faa)
    with open(savepath + "gbff\\" + bacname + ".gbff.zip", mode="wb") as fh1:
        fh1.write(resource1.content)
    with open(savepath + "faa\\" + bacname + ".faa.zip", mode="wb") as fh2:
        fh2.write(resource2.content)
if __name__ == '__main__':
    bacnametable, urlTable = getAllUrl("F:\\生物大数据\\实验\\Bacteria.xlsx")
    savepath = "D:\\BLAST\\blast-2.11.0+\\db\\"
    num = len(urlTable)
    for i in range(1, num):  # row 0 of the spreadsheet is the header, so start from row 1
        print(i)
        download(urlTable[i], savepath, bacnametable[i])
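One caveat: xlrd 2.0 and later can no longer open .xlsx workbooks, so getAllUrl may fail on a fresh environment. Below is a minimal sketch of the same column extraction with openpyxl; the function name get_all_url_openpyxl is my own, and it assumes the same Sheet1 layout as above (header in row 1, name in column B, URL in column C):

from openpyxl import load_workbook

def get_all_url_openpyxl(file_name):
    # read-only mode keeps memory use low for large workbooks
    wb = load_workbook(file_name, read_only=True)
    ws = wb["Sheet1"]
    names, urls = [], []
    # min_row=2 skips the header row; columns B and C hold the name and the URL
    for name, url in ws.iter_rows(min_row=2, min_col=2, max_col=3, values_only=True):
        if name and url:
            names.append(str(name))
            urls.append(str(url))
    return names, urls

With this helper the main block could simply loop over zip(names, urls) instead of tracking an index.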
The code for part 2 is below:
import requests
from bs4 import BeautifulSoup
# collect every value of the given attribute (e.g. href) from the given tag on the page
def getHtml(url, label, attr):
    tempurl = []
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    for target in soup.find_all(label):
        try:
            value = target.get(attr)
        except AttributeError:
            value = ' '
        if value:
            tempurl.append(value)
    return tempurl
# download every tsv file linked from the directory listing
def download(url, savepath):
    tempurl = getHtml(url, 'a', 'href')
    for target in tempurl:  # iterate over every link on the page
        tempstr = target.split('.')
        if len(tempstr) < 3:
            continue
        if tempstr[-2] == "tsv":
            resource = requests.get(url + "/" + target)
            with open(savepath + target, mode="wb") as fh:
                fh.write(resource.content)
            print(target)
if __name__ == '__main__':
    path = "https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/fasta/"
    savepath = "F:\\生物大数据\\实验\\COG基因家族\\"
    download(path, savepath)
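The archives in this directory can be fairly large, and both scripts assume the save folder already exists. Below is a minimal sketch of a streaming variant of the per-file save under those assumptions; the helper name download_file is hypothetical and not part of the script above:

import os
import requests

def download_file(file_url, savepath, filename):
    os.makedirs(savepath, exist_ok=True)   # create the target folder if it does not exist yet
    with requests.get(file_url, stream=True) as resource:
        resource.raise_for_status()        # fail on 404/403 instead of silently saving an error page
        with open(os.path.join(savepath, filename), mode="wb") as fh:
            # write in chunks so a large archive is never held in memory all at once
            for chunk in resource.iter_content(chunk_size=8192):
                fh.write(chunk)

The same helper could replace the requests.get(...).content pattern in either script.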