一、分块下载
特点:可以实现断点续传(从上次中断的位置继续下载)
import os
import requests
def download_file_with_resume(url, dest_path, chunk_size=1024 * 1024):
    """Download ``url`` to ``dest_path`` using HTTP Range requests, resuming
    from an existing partial file if one is present.

    Args:
        url: Direct download URL. The server must honor ``Range`` headers
            (respond 206) for resume to be correct.
        dest_path: Local file path; opened in append mode so a partial
            download is continued rather than restarted.
        chunk_size: Size in bytes of each ranged request (default 1 MB).

    Raises:
        Exception: If a chunk request returns a non-200/206 status.
    """
    # Total size comes from a HEAD request up front; it must NOT be
    # overwritten later — the per-chunk responses report only the chunk's
    # own content-length, which previously corrupted the range arithmetic.
    response = requests.head(url)
    total_size = int(response.headers.get('content-length', 0))

    # Resume point: size of whatever was already written.
    downloaded_size = os.path.getsize(dest_path) if os.path.exists(dest_path) else 0
    if total_size and downloaded_size >= total_size:
        # Nothing left to fetch.
        print(f"Download completed: {dest_path}")
        return

    with open(dest_path, 'ab') as f:
        for start in range(downloaded_size, total_size, chunk_size):
            end = min(start + chunk_size - 1, total_size - 1)
            headers = {'Range': f'bytes={start}-{end}'}
            part_response = requests.get(url, headers=headers, stream=True)
            # NOTE(review): a 200 means the server ignored the Range header
            # and returned the whole file; appending it would corrupt a
            # resumed download — TODO confirm the target server supports 206.
            if part_response.status_code in (206, 200):
                # Stream to disk instead of buffering the whole chunk in memory.
                for data in part_response.iter_content(chunk_size=64 * 1024):
                    f.write(data)
            else:
                raise Exception(f"Failed to download chunk: {part_response.status_code}")
    print(f"Download completed: {dest_path}")
# Example usage — guarded so importing this module does not start a download.
if __name__ == '__main__':
    url = 'https://xxxx.com//video.mp4'
    dest_path = 'abc.mp4'
    chunk_size = 1024 * 1024 * 1  # 1 MB per ranged request
    download_file_with_resume(url, dest_path, chunk_size)
二、线程池分块下载(大幅提高下载速度)
特点:下载速度快,1GB几十秒内下完
import math
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

import requests
import threadpool
class BlockDownload:
    """Download a file by fetching byte-range chunks in parallel threads,
    writing each chunk to a ``.partN`` temp file, then merging them."""

    def __init__(self):
        self.error_try = 5   # retries per chunk before giving up
        self.wait_time = 3   # seconds to sleep between retries
        # NOTE(review): proxy is hard-coded to a local forwarder — adjust or
        # remove for environments without a proxy on 7890.
        self.proxies = {'http': '127.0.0.1:7890', 'https': '127.0.0.1:7890'}
        self.timeout = 6     # per-request timeout in seconds

    def download_chunk(self, url, start_byte, end_byte, chunk_num, output_file_path):
        """Download bytes [start_byte, end_byte] of ``url`` into
        ``{output_file_path}.part{chunk_num}``.

        Retries up to ``self.error_try`` times on errors or bad status codes.

        Raises:
            RuntimeError: If every attempt fails. (The original called
                ``exit(0)`` here, which killed the process from a worker
                thread *with a success status*; raising lets the pool
                surface the failure instead.)
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/547.34 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/547.34',
            'Range': f'bytes={start_byte}-{end_byte}'
        }
        for attempt in range(self.error_try):
            try:
                response = requests.get(url, headers=headers, timeout=self.timeout,
                                        proxies=self.proxies, stream=True)
                if response.status_code in (206, 200):
                    # Stream to disk rather than buffering the whole chunk.
                    with open(f'{output_file_path}.part{chunk_num}', 'wb') as file:
                        for data in response.iter_content(chunk_size=64 * 1024):
                            file.write(data)
                    print(f"块 {chunk_num} 下载完成。")
                    return
                print(f"块 {chunk_num} 下载失败,状态码: {response.status_code}")
            except Exception as e:
                print(f"块 {chunk_num} 下载错误,{attempt} 次重试!", e)
                time.sleep(self.wait_time)
        print(f"块 {chunk_num} 下载失败,程序结束!")
        raise RuntimeError(f"chunk {chunk_num} failed after {self.error_try} attempts")

    def merge_chunks(self, output_file_path, num_chunks):
        """Concatenate ``.part0 .. .part{num_chunks-1}`` into the final file
        and delete each part file after it is copied."""
        with open(output_file_path, 'wb') as output_file:
            for i in range(num_chunks):
                temp_file_path = f'{output_file_path}.part{i}'
                with open(temp_file_path, 'rb') as temp_file:
                    shutil.copyfileobj(temp_file, output_file)
                os.remove(temp_file_path)
        print(f"所有块已合并为: {output_file_path}")

    def get_optimal_chunk_size(self, total_size):
        """Return a chunk size (bytes) scaled to the total file size.

        (Fixes the original's ``slef`` parameter-name typo.)
        """
        if total_size < 64 * 1024 * 1024:        # < 64 MB
            return 1 * 1024 * 1024               # 1 MB
        elif total_size < 512 * 1024 * 1024:     # < 512 MB
            return 2 * 1024 * 1024               # 2 MB
        elif total_size < 1024 * 1024 * 1024:    # < 1024 MB
            return 4 * 1024 * 1024               # 4 MB
        else:
            return 8 * 1024 * 1024               # 8 MB

    def thread_start(self, url, file_path, thread_num):
        """Probe the file size, split it into chunks, download them with a
        thread pool, and merge the parts into ``file_path``.

        Raises:
            RuntimeError: If any chunk ultimately fails (re-raised from the
                worker via its future, instead of being swallowed).
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/547.34 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/547.34'
        }
        response = requests.head(url, headers=headers, timeout=self.timeout, proxies=self.proxies)
        total_size = int(response.headers.get('content-length', 0))
        if not total_size:
            # Some servers answer HEAD with a redirect; follow Location once.
            response = requests.head(response.headers.get('Location', url), headers=headers,
                                     timeout=self.timeout, proxies=self.proxies)
            total_size = int(response.headers.get('content-length', 0))
        if not total_size:
            print(f"视频大小为:{total_size}")
            return
        print(f"文件总大小:{total_size},开始下载...")
        chunk_size = self.get_optimal_chunk_size(total_size)
        block_num = math.ceil(total_size / chunk_size)  # total number of chunks
        # stdlib ThreadPoolExecutor replaces the abandoned third-party
        # ``threadpool`` package; futures also propagate worker exceptions.
        with ThreadPoolExecutor(max_workers=thread_num) as pool:
            futures = []
            for i in range(block_num):
                start_byte = i * chunk_size
                # The last chunk covers whatever bytes remain.
                end_byte = min(start_byte + chunk_size, total_size) - 1
                futures.append(pool.submit(self.download_chunk, url, start_byte,
                                           end_byte, i, file_path))
            for future in futures:
                future.result()  # re-raise any chunk failure
        self.merge_chunks(file_path, block_num)

    def run(self):
        """Example entry point with hard-coded URL/output/thread count."""
        download_url = 'https://xxxx.com/video.mp4'
        file_path = r'D:\Video\xxx.mp4'
        thread_num = 32  # tune to file size and bandwidth
        self.thread_start(download_url, file_path, thread_num)
if __name__ == '__main__':
    # Kick off the example download defined in BlockDownload.run().
    BlockDownload().run()