# 多线程下载 (multi-threaded file downloader)

import os.path
import shutil
import traceback
from queue import Empty, Queue
from threading import Lock, Thread

import requests


class Spider:
    def __init__(self, task_list, thread_num):
        """
        Multi-threaded file downloader.

        :param task_list: [(url, save_path), (url, save_path), ...]
        :param thread_num: number of worker threads
        """
        self.tasks = Queue()  # queue of pending (url, save_path) tasks
        self.task_list = task_list
        self.thread_num = thread_num
        self.lock = Lock()  # serializes appends to failed.txt across workers
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    def urls_in(self):
        """Put every (url, save_path) task onto the work queue."""
        for task in self.task_list:
            self.tasks.put(task)

    def download(self, url, save_path):
        """Download ``url`` to ``save_path``, streaming to disk.

        Skips files that already exist with non-zero size. Raises
        requests.RequestException / OSError on failure so crawl() can
        record the failed task.
        """
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            print(f'{url} 已下载,跳过\n')
            return
        print(f'下载链接: {url} \n')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        # Use the response as a context manager so the connection is
        # released, and fail fast on HTTP error status instead of
        # silently saving an error page as the file.
        with requests.get(url, headers=self.headers, stream=True, timeout=10) as resp:
            resp.raise_for_status()
            with open(save_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)

    def write_failed_txt(self, task):
        """Append a failed task record to failed.txt.

        Held under the lock so concurrent workers don't interleave lines.
        """
        with self.lock:
            with open('failed.txt', 'a', encoding='utf-8') as f:
                f.write(str(task))
                f.write(',')
                f.write('\n')

    # Worker loop: pull tasks until the queue is drained.
    def crawl(self):
        while True:
            try:
                # get_nowait() avoids the empty()/get() race where another
                # worker steals the last task between the check and the
                # blocking get(), which would hang this thread forever.
                task = self.tasks.get_nowait()
            except Empty:
                return
            try:
                self.download(url=task[0], save_path=task[1])
            except Exception:
                # 下载失败,将失败任务写入文件 (record failure, keep worker alive)
                self.write_failed_txt(task)
                print(traceback.format_exc())
            finally:
                self.tasks.task_done()

    # Start the worker threads and block until every task is processed.
    def run(self):
        self.urls_in()
        threads = [Thread(target=self.crawl) for _ in range(self.thread_num)]
        for t in threads:
            t.start()
        self.tasks.join()
        for t in threads:
            t.join()


if __name__ == '__main__':
    # Demo: download a single sound effect with four worker threads.
    download_tasks = [
        ('https://assets.mixkit.co/active_storage/sfx/833/833-preview.mp3',
         './mixkit/More/Construction/Metal hammer hit.mp3'),
    ]
    downloader = Spider(task_list=download_tasks, thread_num=4)
    downloader.run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值