Learning objectives:
Python practice: batch-downloading files from the web
Learning content:
The task has two parts: (1) for the 200-odd links listed in a spreadsheet, open each page and download its gbff and faa files; (2) download all of the csv.zip files from the COG fasta page. The code for part 1 is below (the file names it builds are a bit long-winded; using the names given on the site would also be fine):
import requests
import xlrd
from bs4 import BeautifulSoup
# read every row of the spreadsheet: column 1 holds the bacterium name, column 2 the page URL
def getAllUrl(fileName):
    table1 = []
    table2 = []
    data = xlrd.open_workbook(fileName)
    sheet1 = data.sheet_by_name("Sheet1")
    nrows = sheet1.nrows
    for i in range(nrows):
        table1.append(sheet1.cell(i, 1).value)
        table2.append(sheet1.cell(i, 2).value)
    return table1, table2
# scan all <a href> targets on the page and return the first gbff link and the first faa link
def getHtml(url, label, attr):
    gbff = " "
    faa = " "
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    for target in soup.find_all(label):
        try:
            value = target.get(attr)
            tempstr = value.split('.')
            if len(tempstr) < 4:  # the accessions in my bacteria set all contain a '.', so links that split into fewer than four parts can be skipped
                continue
            if tempstr[-2] == "gbff":
                gbff = value
            if tempstr[-2] == "faa":
                faa = value
            if gbff != " " and faa != " ":  # the page lists several gbff and faa files; keep the first of each
                break
        except AttributeError:
            continue
    return gbff, faa
# download the gbff and faa files linked from one page and save them under savepath
def download(url, savepath, bacname):
    gbff, faa = getHtml(url, 'a', 'href')
    resource1 = requests.get(url + "/" + gbff)
    resource2 = requests.get(url + "/" + faa)
    with open(savepath + "gbff\\" + bacname + ".gbff.zip", mode="wb") as fh1:
        fh1.write(resource1.content)
    with open(savepath + "faa\\" + bacname + ".faa.zip", mode="wb") as fh2:
        fh2.write(resource2.content)
if __name__ == '__main__':
    bacnametable, urlTable = getAllUrl("F:\\生物大数据\\实验\\Bacteria.xlsx")
    savepath = "D:\\BLAST\\blast-2.11.0+\\db\\"
    num = len(urlTable)
    for i in range(1, num):  # row 0 of the spreadsheet is the header, so start from row 1
        print(i)
        download(urlTable[i], savepath, bacnametable[i])
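One caveat: xlrd 2.0 and later can no longer open .xlsx workbooks, so getAllUrl may fail on a fresh environment. Below is a minimal sketch of the same column extraction with openpyxl; the function name get_all_url_openpyxl is my own, and it assumes the same Sheet1 layout as above (header in row 1, name in column B, URL in column C):

from openpyxl import load_workbook

def get_all_url_openpyxl(file_name):
    # read-only mode keeps memory use low for large workbooks
    wb = load_workbook(file_name, read_only=True)
    ws = wb["Sheet1"]
    names, urls = [], []
    # min_row=2 skips the header row; columns B and C hold the name and the URL
    for name, url in ws.iter_rows(min_row=2, min_col=2, max_col=3, values_only=True):
        if name and url:
            names.append(str(name))
            urls.append(str(url))
    return names, urls

With this helper the main block could simply loop over zip(names, urls) instead of tracking an index.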
The code for part 2 is below:
import requests
from bs4 import BeautifulSoup
# collect every value of the given attribute (e.g. href) from the given tag on the page
def getHtml(url, label, attr):
    tempurl = []
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    for target in soup.find_all(label):
        try:
            value = target.get(attr)
        except AttributeError:
            value = ' '
        if value:
            tempurl.append(value)
    return tempurl
# download every tsv file linked from the directory listing
def download(url, savepath):
    tempurl = getHtml(url, 'a', 'href')
    for target in tempurl:  # iterate over every link on the page
        tempstr = target.split('.')
        if len(tempstr) < 3:
            continue
        if tempstr[-2] == "tsv":
            resource = requests.get(url + "/" + target)
            with open(savepath + target, mode="wb") as fh:
                fh.write(resource.content)
            print(target)
if __name__ == '__main__':
    path = "https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/fasta/"
    savepath = "F:\\生物大数据\\实验\\COG基因家族\\"
    download(path, savepath)
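The archives in this directory can be fairly large, and both scripts assume the save folder already exists. Below is a minimal sketch of a streaming variant of the per-file save under those assumptions; the helper name download_file is hypothetical and not part of the script above:

import os
import requests

def download_file(file_url, savepath, filename):
    os.makedirs(savepath, exist_ok=True)   # create the target folder if it does not exist yet
    with requests.get(file_url, stream=True) as resource:
        resource.raise_for_status()        # fail on 404/403 instead of silently saving an error page
        with open(os.path.join(savepath, filename), mode="wb") as fh:
            # write in chunks so a large archive is never held in memory all at once
            for chunk in resource.iter_content(chunk_size=8192):
                fh.write(chunk)

The same helper could replace the requests.get(...).content pattern in either script.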