This happened to be a simple crawler task, so I am sharing the code I used.
# Requirement: crawl the links on http://ycb-benchmarks.s3-website-us-east-1.amazonaws.com/ and download the files
import requests
from bs4 import BeautifulSoup
import time
def downloadFile(name, url):
    # Stream the file to disk and print progress roughly every two seconds
    headers = {'Proxy-Connection': 'keep-alive'}
    r = requests.get(url, stream=True, headers=headers)
    length = float(r.headers['content-length'])  # total size in bytes
    count = 0       # bytes downloaded so far
    count_tmp = 0   # bytes downloaded as of the last progress report
    time1 = time.time()
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                count += len(chunk)
                if time.time() - time1 > 2:
                    p = count / length * 100                       # percent complete
                    speed = (count - count_tmp) / 1024 / 1024 / 2  # MB/s over the last interval
                    count_tmp = count
                    print(name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + 'M/S')
                    time1 = time.time()
def formatFloat(num):
    return '{:.2f}'.format(num)
# Set up the URLs
splice_url = 'http://ycb-benchmarks.s3-website-us-east-1.amazonaws.com/'  # prefix spliced onto each link to build the download URL
url = 'http://ycb-benchmarks.s3-website-us-east-1.amazonaws.com/'
include_http_str = 'http'
include_scripts_str = 'scripts'
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')
for k in soup.find_all('a'):  # get every <a> tag on the page
    # Filter out the links we do not need and keep only the ones to download
    if k['href'].find(include_http_str) < 0:
        if k['href'].find(include_scripts_str) < 0:
            cur_str = splice_url + k['href']  # splice together the full download URL
            cur_count = len(cur_str.split('/'))
            download_name = cur_str.split('/')[cur_count-1]  # file name to save locally
            downloadFile(download_name, cur_str)  # download the file with the helper defined above
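For reference, the same link filtering and file-name handling can be written a bit more defensively with the standard library. The sketch below is only an illustration of the idea, not part of the original script: the helper name iter_download_links is hypothetical, it skips <a> tags that carry no href, and it uses urljoin/basename instead of manual string splicing, while keeping the same filtering rules.

from urllib.parse import urljoin
from os.path import basename

def iter_download_links(page_url):
    # Yield (file_name, absolute_url) pairs for the links worth downloading
    resp = requests.get(page_url)
    soup = BeautifulSoup(resp.text, 'lxml')
    for a in soup.find_all('a'):
        href = a.get('href')  # some <a> tags may have no href at all
        if not href or 'http' in href or 'scripts' in href:
            continue          # same filtering rules as the loop above
        full_url = urljoin(page_url, href)
        yield basename(full_url), full_url

# Hypothetical usage, reusing downloadFile from above:
# for name, link in iter_download_links(url):
#     downloadFile(name, link)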
The code is fairly simple, but it handles the task quite conveniently.