1、多线程爬虫
①生产者和消费者模式是多线程开发中常见的一种模式。通过生产者和消费者模式,可以使每个线程的代码达到高内聚的目标,线程管理更加方便,程序分工更加明确。
②生产者线程专门用来生产一些数据,然后存放到容器(中间变量)中;消费者再从这个中间容器中取出数据进行消费。
1.1 Lock版的生产者和消费者
# Shared wallet for the Lock-based producer/consumer demo.
gMoney = 0
# Number of production rounds performed so far (the demo stops at 10).
gTime = 0
# Plain mutex guarding gMoney and gTime.
gLock = threading.Lock()
class Product(threading.Thread):
    """Producer thread for the Lock-based demo.

    Deposits a random amount (1-10) into the shared wallet ``gMoney`` on
    each round, up to a global total of 10 rounds tracked by ``gTime``.
    All shared state is guarded by ``gLock``.
    """

    def run(self):
        global gMoney, gTime
        while True:
            gLock.acquire()
            # Check the round counter *under* the lock: the original
            # unsynchronized `while gTime < 10` test let several producers
            # pass the check simultaneously and overshoot the 10-round limit.
            if gTime >= 10:
                gLock.release()
                break
            money = random.randint(1, 10)
            gMoney += money
            print('%s生成了%d元钱' % (threading.current_thread().name, money))
            gTime += 1
            gLock.release()
            # Sleep *outside* the critical section; the original slept while
            # holding the lock, blocking every other thread for a second.
            time.sleep(1)
class Custom(threading.Thread):
    """Consumer thread for the Lock-based demo.

    Repeatedly tries to withdraw a random amount (1-100) from the shared
    wallet ``gMoney``; exits once production is finished (``gTime`` >= 10)
    and the balance cannot cover the requested amount.
    """

    def run(self):
        global gMoney, gTime
        finished = False
        while not finished:
            with gLock:
                amount = random.randint(1, 100)
                if amount <= gMoney:
                    # Enough balance: withdraw it.
                    gMoney -= amount
                    print('%s消费了%d元钱' % (threading.current_thread().name, amount))
                elif gTime >= 10:
                    # Producers are done and we cannot afford this amount.
                    finished = True
                else:
                    print('%s消费了%d元钱,但是余额只有%d' % (threading.current_thread().name, amount, gMoney))
            if not finished:
                time.sleep(1)
def main():
    """Launch three producers and five consumers for the Lock demo."""
    for n in range(1, 4):
        Product(name='生产者%d号' % n).start()
    for n in range(5):
        Custom(name='消费者%d号' % n).start()


if __name__ == '__main__':
    main()
1.2 Condition版的生产者和消费者
# Shared wallet for the Condition-based producer/consumer demo.
gMoney = 0
# Number of production rounds performed so far (the demo stops at 10).
gTimes = 0
# Condition variable: provides mutual exclusion plus wait/notify, so
# consumers can sleep until a producer deposits money.
gCond = threading.Condition()
# Producer for the Condition-based demo.
class Producer(threading.Thread):
    """Deposits a random amount (0-100) each round and wakes all waiting
    consumers via the shared condition; stops after 10 global rounds."""

    def run(self) -> None:
        global gMoney, gTimes
        done = False
        while not done:
            with gCond:
                if gTimes >= 10:
                    done = True
                else:
                    earned = random.randint(0, 100)  # 0 <= earned <= 100
                    gMoney += earned
                    gTimes += 1
                    print('%s生成了%d元钱,剩余%d元' % (threading.current_thread().name, earned, gMoney))
                    # Wake every consumer blocked in gCond.wait().
                    gCond.notify_all()
            if not done:
                time.sleep(1)
# Consumer for the Condition-based demo: waits on the condition while broke.
class Consumer(threading.Thread):
    """Withdraws a random amount per round, sleeping on ``gCond`` until a
    producer deposits enough; exits once production has finished and the
    balance can no longer cover the request."""

    def run(self) -> None:
        global gMoney
        while True:
            gCond.acquire()  # lock the shared state
            money = random.randint(0, 100)  # 0 <= money <= 100
            # Standard condition-variable pattern: re-check the predicate
            # after every wakeup, since another consumer may have spent
            # the freshly produced money first.
            while gMoney < money:
                if gTimes >= 10:
                    gCond.release()
                    # `break` would only leave the inner while-loop; `return`
                    # ends the whole thread once production is over.
                    return
                print('%s消费了%d元钱,但是余额只有%d元了。生产者也不在生成了' % (threading.current_thread().name, money, gMoney))
                gCond.wait()
            # Enough balance: consume.
            gMoney -= money
            print('%s消费了%d元钱,剩余%d元' % (threading.current_thread().name, money, gMoney))
            gCond.release()  # unlock
            time.sleep(1)
def main():
    """Launch five producers and five consumers for the Condition demo."""
    for n in range(5):
        Producer(name='生产者%d号' % n).start()
    for n in range(5):
        Consumer(name='消费者%d号' % n).start()


if __name__ == '__main__':
    main()
2、普通方法爬取王者荣耀首页图片
import requests
import os
import random
from urllib import parse
from urllib import request
# Pool of browser User-Agent headers; one is chosen at random and merged
# with the wallpaper-page referer to build the request headers.
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
]
header = dict(random.choice(headers_list), referer='https://pvp.qq.com/web201605/wallpaper.shtml')
def get_url(data):
    """Return the 8 image URLs of one wallpaper record.

    Each ``sProdImgNo_N`` field is URL-encoded; decoding and swapping the
    '200' (thumbnail size) path segment for '0' yields the full-size URL.
    """
    return [
        parse.unquote(data['sProdImgNo_%d' % n]).replace('200', '0')
        for n in range(1, 9)
    ]
def main():
    """Download the first 20 wallpaper albums listed by the pvp.qq.com API.

    Fetches one page of the wallpaper list, then saves every image of
    every album under ablum/<album name>/<index>.jpg.
    """
    base_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1597325146548'
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(base_url, headers=header, verify=False).json()
    datas = response['List']
    print(datas, len(datas))  # 20 records per page
    for data in datas:
        urls = get_url(data)
        name = parse.unquote(data['sProdName'])
        dirpath = os.path.join('ablum', name)
        # makedirs(exist_ok=True) fixes two crashes of the original
        # os.mkdir: the missing 'ablum' parent directory, and re-running
        # the script when the album folder already exists.
        os.makedirs(dirpath, exist_ok=True)
        for index, image_url in enumerate(urls):
            request.urlretrieve(image_url, os.path.join(dirpath, '%d.jpg' % (index + 1)))
            print('%s第%d张图片下载完成!' % (name, index + 1))


if __name__ == '__main__':
    main()
3、利用多线程爬取王者荣耀的图片
import requests
from urllib import parse
import os
from urllib import request
import threading
import queue
# Shared request headers: a desktop browser User-Agent plus the wallpaper
# page as referer (presumably required by the endpoint — verify if removed).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'referer': 'https://pvp.qq.com/web201605/wallpaper.shtml'
}
class Producer(threading.Thread):
    """Fetches wallpaper-list pages and queues every image for download.

    Pulls page URLs from ``page_queue``, parses the JSON response, and
    pushes one ``{'image_url', 'image_path'}`` job per image onto
    ``image_queue`` for the Consumer threads.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # queue.Queue of list-API page URLs
        self.image_queue = image_queue  # queue.Queue of download jobs

    def run(self) -> None:
        while True:
            # get_nowait() instead of `while not empty(): get()`: with
            # several producers the queue can be drained between the
            # empty() check and the blocking get(), hanging this thread.
            try:
                page_url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            resp = requests.get(page_url, headers=headers)
            result = resp.json()
            datas = result['List']
            for data in datas:
                image_urls = extract_images(data)
                # Strip '1:1' — ':' is illegal in Windows directory names
                # (FileNotFoundError: ... '1:1等身雕塑·铠').
                name = parse.unquote(data['sProdName']).replace('1:1', '').strip()
                dir_path = os.path.join('image', name)
                # makedirs(exist_ok=True) also creates the missing 'image'
                # parent and is race-free when several producers hit the
                # same album; the original exists()/mkdir pair was neither.
                os.makedirs(dir_path, exist_ok=True)
                # Queue every image of this album for the consumers.
                for index, image_url in enumerate(image_urls):
                    self.image_queue.put(
                        {'image_url': image_url,
                         'image_path': os.path.join(dir_path, '%d.jpg' % (index + 1))})
# Consumer: downloads the image jobs queued by the Producer threads.
class Consumer(threading.Thread):
    """Pops ``{'image_url', 'image_path'}`` jobs off ``image_queue`` and
    downloads each image; exits after the queue stays empty for 10s,
    which signals that all producers have finished."""

    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue  # queue.Queue of download jobs

    def run(self) -> None:
        while True:
            # Catch only queue.Empty here; the original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit and any real bug.
            try:
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                break
            image_url = image_obj.get('image_url')
            image_path = image_obj.get('image_path')
            try:
                request.urlretrieve(image_url, image_path)
                print(image_path, '下载完成!')
            except Exception:
                # Best-effort: skip the failed image and keep consuming.
                # (Original message said '下载视频!' — "download video" —
                # an obvious typo for "download failed".)
                print(image_path + '下载失败!')
# Build the list of full-resolution image URLs for one wallpaper record.
def extract_images(data):
    """Return the 8 image URLs of *data*.

    Each ``sProdImgNo_N`` field is URL-encoded; decoding and swapping the
    '200' (thumbnail size) path segment for '0' yields the full-size URL.
    """
    return [
        parse.unquote(data['sProdImgNo_%d' % n]).replace('200', '0')
        for n in range(1, 9)
    ]
def main():
    """Wire up the queues and launch 3 producer and 5 consumer threads."""
    page_queue = queue.Queue(18)    # one URL per list page
    image_queue = queue.Queue(1000)  # download jobs produced from the pages
    list_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1597325146548'
    # Pre-fill the page queue before any thread starts.
    for page in range(18):
        page_queue.put(list_url.format(page=page))
    for _ in range(3):
        Producer(page_queue, image_queue).start()
    for _ in range(5):
        Consumer(image_queue).start()


if __name__ == '__main__':
    main()