爬虫练习--多线程 抓取图片

斗图网

单线程

# -*- coding: utf-8 -*-
import os
import re
import time
import requests
import urllib.request


def parse_page(url):
    """Fetch one doutula.com list page and download every meme image on it.

    Image URLs come from the lazy-load ``data-original`` attribute and the
    filename from the ``alt`` text (sanitized, since alt text may contain
    '?' or '.' which break filenames/extensions).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # Bug fix: the downloads targeted "images/" but nothing created it, so a
    # fresh checkout failed on every urlretrieve (silently, via the except).
    os.makedirs("images", exist_ok=True)
    try:
        # timeout so a stalled server can't hang the script indefinitely
        resp = requests.get(url, headers=headers, timeout=10)
        text = resp.text
        img_urls = re.findall(r'data-original="(.*?)"', text)
        img_names = re.findall(r' alt="(.*?)"', text)

        for img_url, raw_name in zip(img_urls, img_names):
            safe_name = re.sub(r"[\?\.]", "", raw_name)
            img_name = safe_name + os.path.splitext(img_url)[1]
            urllib.request.urlretrieve(img_url, "images/" + img_name)
            print(img_name + "     下载完成")
    except Exception as e:
        # best-effort scraper: report and move on to the next page
        print(e)


def main():
    """Scrape the first 5 list pages sequentially and report elapsed time."""
    t1 = time.time()
    for page in range(5):
        url = rf"http://www.doutula.com/photo/list/?page={page}"
        parse_page(url)

    t2 = time.time()

    # Bug fix: the original computed int(t2) - int(t1), truncating each
    # timestamp separately — the result could be off by up to a full second
    # in either direction. Use the true float difference instead.
    t = t2 - t1

    print(f"共耗时{t:.2f}")


if __name__ == '__main__':
    main()

多线程

# -*- coding: utf-8 -*-
import os
import re
import time
import requests
import urllib.request
import threading
from queue import Queue


class Producer(threading.Thread):
    """Worker thread: pulls list-page URLs off ``page_queue``, scrapes each
    page, and pushes ``(img_url, filename)`` pairs onto ``image_queue``.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # NOTE(review): hardcoded free proxy — almost certainly dead by now;
    # replace (or drop the proxies= argument) before running.
    proxys = {
        "http": "http://60.5.254.169:8081"
    }

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        self.image_queue = image_queue  # out: (img_url, filename) tuples
        self.page_queue = page_queue    # in: list-page URLs to scrape

    def run(self):
        # Bug fix: the original did empty() then a blocking get() — with
        # several producers another thread can drain the queue between the
        # check and the get, hanging this thread forever. get_nowait() +
        # Empty makes take-or-quit atomic.
        from queue import Empty
        while True:
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)
            print(url)

    def parse_page(self, url):
        """Scrape one list page; enqueue every (img_url, sanitized name)."""
        try:
            # timeout so a dead proxy/server can't hang the worker for good
            resp = requests.get(url, headers=self.headers,
                                proxies=self.proxys, timeout=10)
            text = resp.text
        except Exception as e:
            # one bad page must not kill the whole worker thread
            print(e)
            return
        img_urls = re.findall(r'data-original="(.*?)"', text)
        img_names = re.findall(r' alt="(.*?)"', text)

        for img_url, raw_name in zip(img_urls, img_names):
            # strip '?' and '.' which break filenames / extensions
            safe_name = re.sub(r"[\?\.]", "", raw_name)
            self.image_queue.put((img_url, safe_name + os.path.splitext(img_url)[1]))


class Consumer(threading.Thread):
    """Worker thread: downloads (img_url, filename) pairs from ``image_queue``.

    Watches ``page_queue`` too, so it only exits once producers have no
    pages left AND the image queue has drained.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        self.image_queue = image_queue  # in: (img_url, filename) tuples
        self.page_queue = page_queue    # watched to decide when to stop

    def run(self) -> None:
        from queue import Empty
        while True:
            # Bug fix: the original checked both queues with empty() and then
            # called a blocking get() — another consumer could take the item
            # in between, hanging this thread forever; it could also exit
            # early while producers were still parsing. A timed get with an
            # Empty re-check handles both races.
            try:
                img_url, filename = self.image_queue.get(timeout=1)
            except Empty:
                if self.page_queue.empty():
                    break  # producers finished and nothing left to download
                continue   # producers still working — poll again
            try:
                urllib.request.urlretrieve(img_url, "images/" + filename)
                print(filename + "   下载完成")
            except Exception as e:
                # a single failed download must not kill the worker
                print(e)
        

def main():
    """Queue list pages 2-9, then run 5 producer and 3 consumer threads."""
    print("程序开始")
    # Bug fix: consumers write into "images/" but nothing created it — every
    # urlretrieve failed with FileNotFoundError on a fresh checkout.
    os.makedirs("images", exist_ok=True)

    page_queue = Queue(500)
    img_queue = Queue(1000)

    for page in range(2, 10):
        page_queue.put(rf"http://www.doutula.com/photo/list/?page={page}")

    workers = []
    for _ in range(5):
        producer = Producer(page_queue, img_queue)
        producer.start()
        workers.append(producer)

    for _ in range(3):
        consumer = Consumer(page_queue, img_queue)
        consumer.start()
        workers.append(consumer)

    # Wait for all workers so main() doesn't return mid-download.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()


# 测试的时候ip被封了。找代理有点麻烦,跳过了

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值