# Multithreaded image-downloading crawler built on threading + Queue

import requests
from lxml import etree
from queue import Queue
import re
from urllib.request import urlretrieve
import threading
import time

class ImgSpider(object):
    """Multithreaded image crawler: a producer/consumer pipeline built on
    ``queue.Queue`` and daemon ``threading.Thread`` workers.

    Pipeline stages (each fed by the previous one):
        url_list_q   -> list-page URLs to fetch
        html_q       -> raw HTML of fetched list pages
        img_tit_url  -> (title, detail-page url) pairs parsed from lists
        img_tit_src  -> (title, image src) pairs ready to download
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        # Seed the pipeline with list pages 2..99.
        self.url_list_q = Queue(100)
        for page in range(2, 100):
            self.url_list_q.put("https://www.hahamx.cn/pic/new/{}".format(page))

        # Raw HTML of list pages.
        self.html_q = Queue(100)
        # (title, detail-page url) pairs.
        self.img_tit_url = Queue(8000)
        # (title, image src) pairs.
        self.img_tit_src = Queue(8000)

    def get_list_html_q(self):
        """Worker: pull a list-page URL, fetch it, push its HTML onto html_q.

        task_done() runs in ``finally`` so a network error cannot leave the
        queue's unfinished-task count stuck (which would hang q.join()).
        """
        while True:
            url = self.url_list_q.get()
            try:
                resp = requests.get(url, headers=self.headers)
                time.sleep(1)  # be polite to the server
                if resp.status_code == 200:
                    self.html_q.put(resp.text)
            except requests.RequestException:
                pass  # best-effort: drop this page on network failure
            finally:
                self.url_list_q.task_done()

    def parse_list_html_q(self):
        """Worker: parse a list page and emit (title, detail-page url) pairs."""
        while True:
            text = self.html_q.get()
            try:
                html = etree.HTML(text)
                titles = html.xpath("//p[@class='word-wrap joke-main-content-text']/text()")
                hrefs = html.xpath("//div[@class='joke-main-content clearfix']//a[@target='_blank']/@href")
                # zip truncates to the shorter list, matching the original
                # map(lambda a, b: (a, b), titles, img_urls) pairing.
                for title, href in zip(titles, hrefs):
                    self.img_tit_url.put((title, "https://www.hahamx.cn" + href))
            finally:
                self.html_q.task_done()

    def parse_img_page(self):
        """Worker: fetch a detail page and extract the lazy-loaded image src."""
        while True:
            title, url = self.img_tit_url.get()
            try:
                text = requests.get(url, headers=self.headers).text
                time.sleep(1)  # be polite to the server
                html = etree.HTML(text)
                srcs = html.xpath("//div[@class='joke-main-content clearfix']//img[@class='joke-main-content-img lazy']/@data-original")
                # Guard against pages without the expected <img> node; the
                # original indexed [0] unconditionally and killed the worker.
                if srcs:
                    self.img_tit_src.put((title, "https:" + srcs[0]))
            except requests.RequestException:
                pass  # best-effort: skip this detail page on network failure
            finally:
                self.img_tit_url.task_done()

    def save_img(self):
        """Worker: download each image into ./data/imgs/ with a CJK-only name."""
        counter = 0
        while True:
            title, src = self.img_tit_src.get()
            try:
                # Keep only CJK characters so the title is filesystem-safe.
                title = re.sub(r"[^\u4e00-\u9fa5]+", "", title)
                exts = re.findall(r"_\d+\.([a-z]+)", src)
                if not exts:
                    continue  # no recognizable extension; skip (finally still runs)
                end_name = exts[0]
                if title == "分享图片":
                    # Generic titles get a numeric suffix to avoid overwrites.
                    title = "分享图片{}".format(counter)
                    counter += 1
                    file_name = "./data/imgs/" + title + "." + end_name
                else:
                    file_name = "./data/imgs/" + title.strip() + "." + end_name
                urlretrieve(src, file_name)
                time.sleep(1)
                print(title + "." + end_name + "写入成功...")
            finally:
                self.img_tit_src.task_done()

    def run(self):
        """Start every worker pool and block until all queues drain."""
        thread_list = []

        # 3 threads fetch list-page HTML.
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.get_list_html_q))

        # 1 thread parses list-page HTML.
        thread_list.append(threading.Thread(target=self.parse_list_html_q))

        # 5 threads resolve detail pages to image srcs.
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.parse_img_page))

        # 5 threads download images to disk.
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.save_img))

        for t in thread_list:
            t.daemon = True  # setDaemon() is deprecated (removed in Python 3.10+)
            t.start()

        # Join the source queue first so every downstream queue has been fed
        # before we wait on it. (The original slept 5s and never joined
        # url_list_q, so the main thread could exit with pages still in flight.)
        for q in (self.url_list_q, self.html_q, self.img_tit_url, self.img_tit_src):
            q.join()

        print("主线程结束...")

# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    isp = ImgSpider()
    isp.run()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值