# 多线程下载 (multi-threaded file downloader)

import os.path
import shutil
import traceback
from queue import Empty, Queue
from threading import Lock, Thread

import requests


class Spider:
    def __init__(self, task_list, thread_num):
        """
        Multi-threaded file downloader.

        :param task_list: [(url, save_path), (url, save_path), ...]
        :param thread_num: number of worker threads
        """
        self.tasks = Queue()  # queue of pending (url, save_path) tasks
        self.task_list = task_list
        self.thread_num = thread_num
        self.lock = Lock()  # serializes appends to failed.txt across workers
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    def urls_in(self):
        """Put every (url, save_path) task onto the work queue."""
        for task in self.task_list:
            self.tasks.put(task)

    def download(self, url, save_path):
        """Download ``url`` to ``save_path``, streaming to disk.

        Skips files that already exist with non-zero size. Raises
        requests.RequestException / OSError on failure so crawl() can
        record the failed task.
        """
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            print(f'{url} 已下载,跳过\n')
            return
        print(f'下载链接: {url} \n')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        # Use the response as a context manager so the connection is
        # released, and fail fast on HTTP error status instead of
        # silently saving an error page as the file.
        with requests.get(url, headers=self.headers, stream=True, timeout=10) as resp:
            resp.raise_for_status()
            with open(save_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)

    def write_failed_txt(self, task):
        """Append a failed task record to failed.txt.

        Held under the lock so concurrent workers don't interleave lines.
        """
        with self.lock:
            with open('failed.txt', 'a', encoding='utf-8') as f:
                f.write(str(task))
                f.write(',')
                f.write('\n')

    # Worker loop: pull tasks until the queue is drained.
    def crawl(self):
        while True:
            try:
                # get_nowait() avoids the empty()/get() race where another
                # worker steals the last task between the check and the
                # blocking get(), which would hang this thread forever.
                task = self.tasks.get_nowait()
            except Empty:
                return
            try:
                self.download(url=task[0], save_path=task[1])
            except Exception:
                # 下载失败,将失败任务写入文件 (record failure, keep worker alive)
                self.write_failed_txt(task)
                print(traceback.format_exc())
            finally:
                self.tasks.task_done()

    # Start the worker threads and block until every task is processed.
    def run(self):
        self.urls_in()
        threads = [Thread(target=self.crawl) for _ in range(self.thread_num)]
        for t in threads:
            t.start()
        self.tasks.join()
        for t in threads:
            t.join()


if __name__ == '__main__':
    # Demo: download a single sound effect with four worker threads.
    download_tasks = [
        ('https://assets.mixkit.co/active_storage/sfx/833/833-preview.mp3',
         './mixkit/More/Construction/Metal hammer hit.mp3'),
    ]
    downloader = Spider(task_list=download_tasks, thread_num=4)
    downloader.run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值