Biological Big Data Lab 3

Learning objectives:

Learning Python: batch-downloading files from the web

Learning content:

1. Following the 200-odd links given in a spreadsheet, open each page and download its gbff and faa files.
2. From the COG fasta page, download all of the csv.zip files.
The code for task 1 is below. My file names are a bit long-winded; the names given on the site would do just as well.
import requests
import xlrd  # note: xlrd 2.0+ dropped .xlsx support, so this needs xlrd 1.2.x
from bs4 import BeautifulSoup

# Read the bacterium names (column B) and page URLs (column C) from the spreadsheet.
def getAllUrl(fileName):
    table1 = []
    table2 = []
    data = xlrd.open_workbook(fileName)
    sheet1 = data.sheet_by_name("Sheet1")
    for i in range(sheet1.nrows):
        table1.append(sheet1.cell(i, 1).value)
        table2.append(sheet1.cell(i, 2).value)
    return table1, table2

# Scan every <label> tag on the page and return the first gbff and faa links found.
def getHtml(url, label, attr):
    gbff = " "
    faa = " "
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    for target in soup.find_all(label):
        value = target.get(attr)  # None when the tag lacks the attribute
        if value is None:
            continue
        tempstr = value.split('.')
        # The accession numbers of my bacteria all contain a '.', so any link
        # that splits into fewer than four dot-separated parts can be ruled out.
        if len(tempstr) < 4:
            continue
        if tempstr[-2] == "gbff":
            gbff = value
        if tempstr[-2] == "faa":
            faa = value
        # The page lists several faa and gbff files; keep the first of each.
        if gbff != " " and faa != " ":
            break
    return gbff, faa

def download(url, savepath, bacname):
    gbff, faa = getHtml(url, 'a', 'href')
    resource1 = requests.get(url + "/" + gbff)
    resource2 = requests.get(url + "/" + faa)
    # 'with' closes the files automatically; no explicit close() is needed.
    with open(savepath + "gbff\\" + bacname + ".gbff.zip", mode="wb") as fh1:
        fh1.write(resource1.content)
    with open(savepath + "faa\\" + bacname + ".faa.zip", mode="wb") as fh2:
        fh2.write(resource2.content)

if __name__ == '__main__':
    bacnametable, urlTable = getAllUrl(r"F:\生物大数据\实验\Bacteria.xlsx")
    savepath = "D:\\BLAST\\blast-2.11.0+\\db\\"
    num = len(urlTable)
    # num = 5  # handy for a quick test run
    for i in range(1, num):  # start from row 1, skipping the header row
        print(i)
        download(urlTable[i], savepath, bacnametable[i])
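A caveat on the spreadsheet step: xlrd removed .xlsx support in version 2.0, so the script above only runs against xlrd 1.2.x. If pinning an old version is not an option, here is a minimal sketch of the same getAllUrl built on openpyxl instead, assuming the layout used above (a sheet named "Sheet1", names in column B, URLs in column C):

# A sketch of getAllUrl using openpyxl (assumes the same layout as above:
# sheet "Sheet1", bacterium names in column B, page URLs in column C).
from openpyxl import load_workbook

def getAllUrl(fileName):
    wb = load_workbook(fileName, read_only=True)
    sheet = wb["Sheet1"]
    table1, table2 = [], []  # bacterium names, page URLs
    for name, url in sheet.iter_rows(min_col=2, max_col=3, values_only=True):
        table1.append(name)
        table2.append(url)
    return table1, table2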

The code for task 2 is below:

import requests
from bs4 import BeautifulSoup

# Collect the value of the given attribute from every <label> tag on the page.
def getHtml(url, label, attr):
    tempurl = []
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    for target in soup.find_all(label):
        value = target.get(attr)  # None when the tag lacks the attribute
        if value:
            tempurl.append(value)
    return tempurl

def download(url, savepath):
    tempurl = getHtml(url, 'a', 'href')
    for target in tempurl:  # walk every link on the page
        tempstr = target.split('.')
        if len(tempstr) < 3:
            continue
        if tempstr[-2] == "tsv":  # keep only the *.tsv.* files
            resource = requests.get(url + "/" + target)
            with open(savepath + target, mode="wb") as fh:
                fh.write(resource.content)
            print(target)

if __name__ == '__main__':
    path = "https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/fasta/"
    savepath = "F:\\生物大数据\\实验\\COG基因家族\\"
    download(path, savepath)
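Gluing the page URL and the link together with "+" works here because the NCBI directory listing uses relative hrefs, but urllib.parse.urljoin also copes with absolute links and trailing slashes, and streaming the response avoids buffering a whole archive in memory. A minimal sketch of a sturdier fetch step (the function name and parameters are illustrative, not part of the scripts above):

import os
from urllib.parse import urljoin

import requests

# A sketch of a more robust download: resolve each href against the page URL
# and stream the body to disk in chunks rather than reading it all at once.
def fetch(page_url, href, savedir):
    full_url = urljoin(page_url, href)  # handles relative and absolute hrefs alike
    local = os.path.join(savedir, os.path.basename(href))
    with requests.get(full_url, stream=True, timeout=60) as resp:
        resp.raise_for_status()  # fail loudly on HTTP errors instead of saving an error page
        with open(local, mode="wb") as fh:
            for chunk in resp.iter_content(chunk_size=8192):
                fh.write(chunk)
    return local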
