使用requests库
小下载: 需要一次性写到内存,花费一定空间,然后写入磁盘。
import requests

# Small-file download: the whole body is buffered in memory (r.content)
# and written out in one go — only suitable for small files.
image_url = "https://www.python.org/static/community_logos/python-logo-master-v3-TM.png"
r = requests.get(image_url, timeout=30)  # timeout: never hang forever on a dead server
r.raise_for_status()  # fail loudly on HTTP errors instead of saving an error page
with open("python_logo.png", 'wb') as f:
    f.write(r.content)
大文件下载:
分块写入到磁盘中,需要的内存固定,但如果块太小的话,程序效率低。
import requests

# Large-file download: stream=True fetches the body chunk-by-chunk so
# memory stays bounded regardless of file size.
file_url = "http://codex.cs.yale.edu/avi/db-book/db4/slide-dir/ch1-2.pdf"
# Context manager closes the streamed connection even on error
# (the original left it open).
with requests.get(file_url, stream=True, timeout=30) as r:
    r.raise_for_status()
    with open("python.pdf", "wb") as pdf:
        # 1 KiB chunks bound memory; a larger chunk would mean fewer
        # Python-level iterations, but this matches the original tutorial.
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                pdf.write(chunk)
**文件批量下载:**通过解析器解析出需要的元素,然后过滤文件名。
import requests
from bs4 import BeautifulSoup
archive_url = "http://www-personal.umich.edu/~csev/books/py4inf/media/"

def get_video_links():
    """Scrape the archive index page and return absolute URLs of all .mp4 links.

    Returns:
        list[str]: archive_url joined with each href ending in 'mp4'.
    """
    r = requests.get(archive_url, timeout=30)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    # link.get() guards against <a> tags with no href attribute, which
    # would raise KeyError with link['href'] in the original.
    video_links = [archive_url + link.get('href', '')
                   for link in links
                   if link.get('href', '').endswith('mp4')]
    return video_links
def download_video_series(video_links):
    """Download every URL in *video_links* into the current directory.

    The file name is taken from the last path component of each URL.
    Prints progress to stdout; returns None.
    """
    for link in video_links:
        file_name = link.split('/')[-1]
        print("Downloading file:%s" % file_name)
        # Context manager releases the streamed connection even if a
        # chunk read fails — the original leaked the response object.
        with requests.get(link, stream=True, timeout=30) as r:
            with open(file_name, 'wb') as f:
                # 1 MiB chunks: fixed memory, few Python-level iterations.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        print("%s downloaded!\n" % file_name)
    print("All videos downloaded!")
# Script entry point. The original `if name == “main”:` is broken twice:
# missing double underscores and typographic (smart) quotes, which is a
# SyntaxError in Python.
if __name__ == "__main__":
    video_links = get_video_links()
    download_video_series(video_links)
一个错误的例子:
每次1字节1字节的写,浪费了大量时间。
实现代码
#-*- coding: UTF-8 -*-
import requests
from contextlib import closing
class ProgressBar(object):
    """Single-line console progress bar, redrawn in place via '\\r'.

    Counts are tracked in raw units (e.g. bytes) and displayed divided
    by ``chunk_size`` (e.g. shown as KB when chunk_size is 1024).
    """

    def __init__(self, title, count=0.0, run_status=None, fin_status=None,
                 total=100.0, unit='', sep='/', chunk_size=1.0):
        super(ProgressBar, self).__init__()
        # Display template: [title] status current unit sep total unit
        self.info = "[%s] %s %.2f %s %s %.2f %s"
        self.title = title
        self.total = total
        self.count = count
        self.chunk_size = chunk_size
        self.status = run_status or ""
        # Pad the finished label so it fully overwrites the running label
        # when the line is redrawn.
        self.fin_status = fin_status or " " * len(self.status)
        self.unit = unit
        self.seq = sep

    def __get_info(self):
        # [title] status progress unit sep total unit
        _info = self.info % (self.title, self.status,
                             self.count / self.chunk_size, self.unit,
                             self.seq, self.total / self.chunk_size, self.unit)
        return _info

    def refresh(self, count=1, status=None):
        """Advance the bar by *count* raw units and redraw the line.

        Switches to ``fin_status`` and emits a newline once the running
        count reaches ``total``.
        """
        self.count += count
        self.status = status or self.status
        end_str = "\r"
        if self.count >= self.total:
            end_str = '\n'
            self.status = status or self.fin_status
        # flush=True so the in-place update is visible immediately even
        # when stdout is block-buffered (original bar could stay hidden
        # until the final newline).
        print(self.__get_info(), end=end_str, flush=True)
if __name__ == '__main__':
    url = input('请输入需要下载的文件链接:\n')
    filename = url.split('/')[-1]
    # Stream the response; closing() guarantees the connection is
    # released when the block exits.
    with closing(requests.get(url, stream=True)) as response:
        if response.status_code == 200:
            chunk_size = 1024  # read/write in 1 KiB chunks
            # Status is checked BEFORE touching Content-Length (the
            # original read the header first and raised KeyError on
            # error responses); .get() also tolerates a server that
            # omits the header — the bar then just shows total 0.
            content_size = int(response.headers.get('content-length', 0))
            print('文件大小:%0.2f KB' % (content_size / chunk_size))
            progress = ProgressBar("%s下载进度" % filename,
                                   total=content_size,
                                   unit="KB",
                                   chunk_size=chunk_size,
                                   run_status="正在下载",
                                   fin_status="下载完成")
            with open(filename, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    progress.refresh(count=len(data))
        else:
            print('链接异常')