Python requests下载超大文件/批量下载文件

Rick_M359

已于 2022-12-31 22:10:01 修改

阅读量3.2k

点赞数 1

文章标签： python 开发语言 Powered by 金山文档

于 2022-12-31 21:59:27 首次发布

本文链接：https://blog.csdn.net/Rick_M/article/details/128508994

版权

（一）下载超大文件：

使用 Python 下载超大文件时，如果一次性把全部内容读入内存，文件过大可能会造成内存不足；这时应使用 requests 的 stream 模式（stream=True），边下载边分块写入磁盘。

主要代码如下

iter_content：一块一块地遍历要下载的内容（每块大小由 chunk_size 指定，适合二进制文件）

iter_lines：一行一行地遍历要下载的内容（适合按行组织的文本数据）

def download_file(url, file_pname, chunk_size=1024*4):
    """Stream the resource at ``url`` to disk without buffering it in memory.

    Args:
        url: URL of the file to download.
        file_pname: Path of the local file to write. It is opened in binary
            write mode, so an existing file is overwritten.
        chunk_size: Number of bytes requested per ``iter_content`` chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Using the response as a context manager releases the connection even
    # when writing fails part-way through; a bare ``requests.get(...,
    # stream=True)`` would leave it open until garbage collection.
    with requests.get(url, stream=True) as response:
        # Fail loudly instead of saving an error page under the target name.
        response.raise_for_status()
        with open(file_pname, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty chunks
                    f.write(chunk)

# 下载大文件 应用实例：
def Big_Download(session, url_inquire, headers, form_data,
                 file_name='Expense.json', chunk_size=512):
    """POST ``form_data`` and stream the response body to a file with a progress bar.

    The request is sent exactly once; the same streamed response supplies both
    the ``content-length`` header (used as the progress-bar total) and the body.

    Args:
        session: Object whose ``post`` method is used to send the request
            (presumably a ``requests.Session`` -- confirm against the caller).
        url_inquire: URL the form is posted to.
        headers: HTTP headers sent with the request.
        form_data: Form payload passed as ``data=``.
        file_name: Output path, also used as the progress-bar label.
        chunk_size: Number of bytes requested per ``iter_content`` chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    Note:
        ``tqdm`` must be importable in the enclosing module
        (``from tqdm import tqdm``).
    """
    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept to preserve the original behaviour -- confirm it is really required.
    with session.post(url=url_inquire, data=form_data, headers=headers,
                      verify=False, stream=True) as r:
        r.raise_for_status()
        # The header is missing for chunked responses; total=None makes tqdm
        # show a running byte count instead of a percentage.
        content_length = r.headers.get('content-length')
        file_size = int(content_length) if content_length is not None else None
        with tqdm(total=file_size, unit='B', unit_scale=True,
                  unit_divisor=1024, ascii=True, desc=file_name) as bar:
            with open(file_name, 'wb') as fp:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty chunks
                        fp.write(chunk)
                        bar.update(len(chunk))

（二）批量下载文件：

#批量文件下载
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Directory listing page that is scanned for downloadable .mp4 files.
archive_url = "http://www-personal.umich.edu/~csev/books/py4inf/media/"


def get_links():
    """Return the absolute URLs of all links on ``archive_url`` ending in ``mp4``.

    Returns:
        list[str]: One URL per matching ``<a>`` tag, in document order.

    Raises:
        requests.HTTPError: If the listing page answers with a 4xx/5xx status.
    """
    r = requests.get(archive_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib')
    # href=True skips anchors without an href attribute, which would otherwise
    # raise KeyError on link['href'].
    anchors = soup.find_all('a', href=True)
    # urljoin resolves relative hrefs against the listing URL and leaves
    # absolute hrefs intact, unlike plain string concatenation.
    return [
        urljoin(archive_url, anchor['href'])
        for anchor in anchors
        if anchor['href'].endswith('mp4')
    ]

def download_series(video_links):
    """Download every URL in ``video_links`` into the current working directory.

    Each file is streamed to disk in 1 MiB chunks and saved under the last
    path segment of its URL. Progress messages are printed to stdout.

    Args:
        video_links: Iterable of URL strings to download.

    Raises:
        requests.HTTPError: If any download answers with a 4xx/5xx status.
    """
    for link in video_links:
        # The last path segment of the URL becomes the local file name.
        file_name = link.split('/')[-1]
        print("Downloading file:%s" % file_name)
        # The context manager releases the connection after each file.
        with requests.get(link, stream=True) as r:
            r.raise_for_status()
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
        print("%s downloaded!\n" % file_name)
    # Printed once, after the loop: finishing the first file must not end the run.
    print("All videos downloaded!")

if __name__ == "__main__":
    # Script entry point: gather the links, then hand them to the downloader.
    download_series(get_links())