下载大文件时(这里只针对单一文件),可能会由于网络波动、连接超时、下载耗时过长等原因,导致下载失败。
下面这串代码就是用于解决上述问题:
- 1.使用多线程加速下载,避免下载时间过长
- 2.实现断点续传,在因为各种原因中断导致下载失败后,能接着前面的文件继续下载。
# -*- codeing=utf-8 -*-
# @Time:2022/5/29 20:33
# @Author:Ye Zhoubing
# @File: download_large_file.py
# @software:PyCharm
"""
python 多线程下载大文件,并实现断点续传
"""
```python
# -*- codeing=utf-8 -*-
# @Time:2022/5/29 20:33
# @Author:Ye Zhoubing
# @File: download_large_file.py
# @software:PyCharm
"""
python 多线程下载大文件,并实现断点续传
"""
import os
import time
import httpx
from tqdm import tqdm
from threading import Thread
import datetime
import sys
class Logger(object):
    """Tee stream: mirrors everything written to a terminal stream into a log file.

    Intended to replace ``sys.stdout`` so that console output is also
    captured on disk.
    """

    def __init__(self, filename='default.log', stream=sys.stdout):
        """
        :param filename: path of the log file (truncated on open)
        :param stream: original stream that keeps receiving the output
        """
        self.terminal = stream
        self.log = open(filename, 'w', encoding='utf-8')

    def write(self, message):
        """Write *message* to both the terminal stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams.

        The original implementation was a no-op (``pass``), so buffered log
        data could be silently lost when the interpreter exited or when a
        caller flushed ``sys.stdout`` explicitly.
        """
        self.terminal.flush()
        self.log.flush()
class DownloadFile(object):
    """Download a single large file with multiple threads and resume support.

    The remote file is split into ``thread_num`` inclusive byte ranges; each
    range is downloaded into its own part file (``<name>_<index>``) so an
    interrupted run can resume from the bytes already on disk. The part files
    are then concatenated into the final file.
    """

    def __init__(self, download_url, data_folder, thread_num):
        """
        :param download_url: URL of the file to download
        :param data_folder: directory in which the file is stored
        :param thread_num: number of worker threads to spawn
        """
        self.download_url = download_url
        self.data_folder = data_folder
        self.thread_num = thread_num
        self.file_size = None   # total size in bytes, filled by get_file_size()
        self.cut_size = None    # bytes per range, filled by cutting()
        self.tqdm_obj = None    # progress bar shared by all worker threads
        self.thread_list = []
        self.file_path = os.path.join(self.data_folder, download_url.split('/')[-1])

    def downloader(self, etag, thread_index, start_index, stop_index, retry=False):
        """Download one byte range into its part file, resuming if it exists.

        :param etag: ETag from the initial probe, used as If-Range validator
        :param thread_index: 1-based worker index (also the part-file suffix)
        :param start_index: first byte of the assigned range
        :param stop_index: last byte of the range, or '-' meaning "until EOF"
        :param retry: True on recursive re-entry after a network error; skips
            the progress-bar catch-up so resumed bytes are not counted twice
        """
        sub_path_file = "{}_{}".format(self.file_path, thread_index)
        if os.path.exists(sub_path_file):
            temp_size = os.path.getsize(sub_path_file)  # bytes already on disk
            if not retry:
                self.tqdm_obj.update(temp_size)  # catch the progress bar up
        else:
            temp_size = 0
        if stop_index == '-':
            stop_index = ""  # open-ended range: "bytes=N-"
        # If-Range makes the server honour the partial request only while the
        # resource is unchanged. NOTE(review): 'ETag' is a response header and
        # has no meaning on a request; kept to preserve original behaviour.
        headers = {'Range': 'bytes={}-{}'.format(start_index + temp_size, stop_index),
                   'ETag': etag, 'if-Range': etag,
                   }
        down_file = open(sub_path_file, 'ab')
        try:
            with httpx.stream("GET", self.download_url, headers=headers) as response:
                num_bytes_downloaded = response.num_bytes_downloaded
                for chunk in response.iter_bytes():
                    if chunk:
                        down_file.write(chunk)
                        # advance the bar by the delta received since last chunk
                        self.tqdm_obj.update(response.num_bytes_downloaded - num_bytes_downloaded)
                        num_bytes_downloaded = response.num_bytes_downloaded
        except Exception as e:
            # Any network failure: report and retry this range from where the
            # part file left off (resume point recomputed on re-entry).
            print("Thread-{}:请求超时,尝试重连\n报错信息:{}".format(thread_index, e))
            self.downloader(etag, thread_index, start_index, stop_index, retry=True)
        finally:
            down_file.close()
        return

    def get_file_size(self):
        """Probe the remote file for its size and ETag.

        :return: ``(total_size_in_bytes, etag)``; etag is '' when absent
        """
        with httpx.stream("GET", self.download_url) as response2:
            total_size = int(response2.headers["Content-Length"])
            # httpx headers are case-insensitive; this replaces the original
            # manual scan over headers.raw (with its typo'd loop variable).
            etag = response2.headers.get("ETag", "")
        return total_size, etag

    def cutting(self):
        """Split the file into one inclusive byte range per thread.

        :return: ``({thread_index: [chunk_size, start, stop]}, chunk_size)``
            where the last range's stop is '-' meaning "until EOF".
        """
        cut_info = {}
        cut_size = self.file_size // self.thread_num
        for num in range(1, self.thread_num + 1):
            if num != 1:
                cut_info[num] = [cut_size, cut_size * (num - 1) + 1, cut_size * num]
            else:
                # first range starts at byte 0 (inclusive)
                cut_info[num] = [cut_size, cut_size * (num - 1), cut_size * num]
            if num == self.thread_num:
                # last thread reads to EOF to pick up the division remainder
                cut_info[num][2] = '-'
        return cut_info, cut_size

    def write_file(self):
        """Merge the per-thread part files into the final file.

        Part files are concatenated in thread order and deleted once merged.
        """
        if os.path.exists(self.file_path):
            # BUG FIX: the original compared len(self.file_path) — the length
            # of the path *string* — against the file size. Compare the actual
            # on-disk size so an already-completed download is detected.
            if os.path.getsize(self.file_path) >= self.file_size:
                return
        with open(self.file_path, 'ab') as f_count:
            for thread_index in range(1, self.thread_num + 1):
                sub_file = "{}_{}".format(self.file_path, thread_index)
                with open(sub_file, 'rb') as sub_write:
                    f_count.write(sub_write.read())
                # delete each part file once its bytes have been merged
                os.remove(sub_file)
        return

    def create_thread(self, etag, cut_info):
        """Start one download thread per range and wait for all to finish.

        :param etag: validator passed through to :meth:`downloader`
        :param cut_info: range table produced by :meth:`cutting`
        """
        for thread_index in range(1, self.thread_num + 1):
            thread = Thread(target=self.downloader,
                            args=(etag, thread_index, cut_info[thread_index][1], cut_info[thread_index][2]))
            # attribute assignment replaces the deprecated setName()/setDaemon()
            thread.name = 'Thread-{}'.format(thread_index)
            thread.daemon = True
            thread.start()
            self.thread_list.append(thread)
        for thread in self.thread_list:
            thread.join()
        return

    def check_thread_status(self):
        """Poll the workers and report stopped ones once per second.

        Debugging helper; loops forever and is never called by :meth:`main`.
        """
        while True:
            for thread in self.thread_list:
                # is_alive()/name replace isAlive()/getName(); isAlive was
                # removed in Python 3.9.
                if not thread.is_alive():
                    print("{}:已停止".format(thread.name))
            time.sleep(1)

    def create_data(self):
        """Create the download directory if it does not already exist."""
        if not os.path.exists(self.data_folder):
            os.mkdir(self.data_folder)
        return

    def main(self):
        """Run the full pipeline: probe, split, download in parallel, merge."""
        self.create_data()
        self.file_size, etag = self.get_file_size()
        # split the download evenly across the configured thread count
        cut_info, self.cut_size = self.cutting()
        # one shared progress bar for every worker, shown in binary units
        self.tqdm_obj = tqdm(total=self.file_size, unit_scale=True, desc=self.file_path.split('/')[-1],
                             unit_divisor=1024,
                             unit="B")
        # start the worker threads and block until they all finish
        self.create_thread(etag, cut_info)
        # stitch the part files together
        self.write_file()
        return
if __name__ == '__main__':
    # Tee everything print()-ed to the console into log.txt as well.
    # Comment this line out if no log file is wanted.
    sys.stdout = Logger(r'log.txt', sys.stdout)
    # sys.stderr = Logger(r'log_file.txt', sys.stderr)
    time_fmt = '%Y-%m-%d %H:%M:%S'
    separator = "==" * 20
    begin_stamp = datetime.datetime.now().strftime(time_fmt)
    print("开始时间:" + begin_stamp)
    print(separator)
    download_url = "https://heyulei1.github.io/videos/1.mp4"
    # Store the download next to this script, in a 'Data' subdirectory.
    data_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Data')
    # More threads can speed things up, but keep the count moderate —
    # the sweet spot depends on your machine and connection.
    thread_num = 20
    DownloadFile(download_url, data_folder, thread_num).main()
    print(download_url, '完成')
    finish_stamp = datetime.datetime.now().strftime(time_fmt)
    print(separator)
    print("结束时间:" + finish_stamp + "\n")
```