python多线程爬取图虫网图片

该博客展示了如何使用Python进行多线程爬取图虫网站上的图片。代码中定义了Producer和Consumer两个线程类,Producer负责从网页中获取图片URL并放入队列,Consumer则从队列中取出URL并下载图片。用户输入关键字和爬取页数,程序将按照指定条件下载相应页面的图片。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

python多线程爬取图虫网图片


直接上代码

import requests
import re
from urllib import request
import os
import threading
import queue

# Desktop-browser User-Agent sent with every page request so the site
# serves the normal HTML markup instead of a bot/mobile variant.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
}

class Producer(threading.Thread):
    """Worker thread that turns search-result pages into download jobs.

    Pulls page URLs from ``page_queue``, extracts image ids from the page
    body via ``imageurls`` and pushes ``{'image_url', 'image_path'}`` job
    dicts onto ``image_queue`` for the Consumer threads.
    """

    def __init__(self, page_queue, image_queue, title, i, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # queue of search-result page URLs
        self.image_queue = image_queue  # queue of download jobs for consumers
        self.title = title              # search keyword; also the folder name
        self.i = i                      # numeric tag embedded in file names

    def run(self):
        while True:
            # Non-blocking get: the original empty()/get() pair is a race —
            # another producer can drain the queue between the two calls and
            # leave this thread blocked on get() forever.
            try:
                url = self.page_queue.get(block=False)
            except queue.Empty:
                break
            print(url)
            tag = str(self.i)
            resp = requests.get(url, headers=headers)
            html = resp.content.decode('utf8')
            image_ids = imageurls(html)

            # 自行修改图片保存地址 (adjust the save directory as needed)
            dirpath = os.path.join('E:/图虫创意', self.title)
            # makedirs(exist_ok=True) is safe when several producer threads
            # race to create the same folder, and also creates the parent
            # directory if missing (os.mkdir would raise in both cases).
            os.makedirs(dirpath, exist_ok=True)

            for x, image_id in enumerate(image_ids):
                image_url = 'https://weiliicimg9.pstatp.com/weili/sm/' + image_id + '.jpeg'
                self.image_queue.put({
                    'image_url': image_url,
                    'image_path': os.path.join(dirpath, self.title + '_' + tag + '_%d.jpg' % x),
                })

class Consumer(threading.Thread):
    """Worker thread that downloads queued images to disk.

    Loops over ``image_queue`` jobs and saves each image with
    ``urllib.request.urlretrieve``; exits after the queue has been
    empty for 10 seconds.
    """

    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue  # queue of {'image_url', 'image_path'} jobs

    def run(self):
        while True:
            try:
                # 10s timeout doubles as the shutdown signal: once producers
                # stop feeding the queue, the worker times out and exits.
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                # Narrowed from a bare except: only a drained queue should end
                # the loop (a bare except also swallowed KeyboardInterrupt
                # and SystemExit, making Ctrl-C unreliable).
                break
            imageurl = image_obj.get('image_url')
            imagepath = image_obj.get('image_path')
            try:
                request.urlretrieve(imageurl, imagepath)
                print(imagepath + "下载完成!")
            except Exception:
                # Best-effort per image: report the failure and keep
                # consuming instead of killing the thread. Narrowed from a
                # bare except so Ctrl-C still propagates.
                print(imagepath + "下载失败!")

def imageurls(req):
    """Extract image-id strings from a search-result page body.

    Matches every ``"key":"digits"`` pair in *req* and returns the list of
    digit values, in order of appearance.

    The original three-step version (match whole pairs, join, re-scan for
    digit runs) also leaked digits embedded in the *key* names — e.g.
    ``"img2":"123"`` yielded both ``'2'`` and ``'123'``. Capturing only the
    quoted value in a single pass fixes that.

    :param req: decoded HTML/JSON text of a search-result page
    :return: list of image-id strings (may be empty)
    """
    return re.findall(r'"\w+":"(\d+)"', req)

def main():
    """Prompt for a keyword and page count, then crawl with a thread pool.

    Queues one search-result URL per page, then starts 3 Producer and
    5 Consumer threads that share the two queues.
    """
    title = input('请输入关键字:')
    page = int(input('请输入爬取页数:'))

    page_queue = queue.Queue(page)
    image_queue = queue.Queue(1000)
    for i in range(page):
        url = 'https://stock.tuchong.com/search?page={}&platform=image&size=100&sortBy=0&term='.format(i) + title
        page_queue.put(url)

    # Bug fix: these two loops were indented inside the page loop above, so
    # an N-page crawl started 3*N producers and 5*N consumers. Spawn the
    # pool exactly once, after all page URLs are queued.
    # NOTE(review): the 4th Producer argument only tags output file names;
    # producers pull pages from the shared queue regardless of its value.
    for x in range(3):
        Producer(page_queue, image_queue, title, 0, name="生产者%d号" % x).start()

    for x in range(5):
        Consumer(image_queue, name="消费者%d号" % x).start()


if __name__ == '__main__':
    main()
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值