python queue 多生产者,多消费者

项目需求是从快代理爬取ip,运用到项目中,设置队列的大小,当有ip被消耗时,就添加新的ip到队列中,获取ip的页面是不断刷新的。

总的来说,就是一方(生产者)不断把爬取到的ip存入队列,另一方(消费者)不断从队列中取出并消耗ip;此处爬取的是快代理。

 

首先创建一个队列

from queue import Queue

# Pass maxsize=N (for example maxsize=50) to cap the queue length;
# with no argument the queue is unbounded.
q = Queue()

其次,创建一个获取ip通知锁

import threading
# Condition variable used as the "fetch more proxy IPs" notification lock.
proxyLock = threading.Condition()

然后编写爬取ip代码,把它逐个加到队列中

queue.Empty:非阻塞取数据(例如 get_nowait())时,如果队列为空,就会抛出该异常;而 queue.empty() 只是返回队列当前是否为空(True/False),本身不会报异常

queue._put():Queue 的内部方法,直接把元素添加到队列中;它不加锁、也不检查 maxsize,所以调用前必须自己先持有队列的锁(如下面代码中的 not_full.acquire())

# coding:utf-8
import requests, traceback
import threading, time, re

try:
    from queue import Queue, Empty
except:
    from Queue import Queue, Empty

# ---- Module-level shared state ----

# Synchronisation primitives shared by the producer thread and the consumers.
proxyLock = threading.Condition()  # notification lock for "please fetch proxies"
proxyQueue = Queue()  # holds the fetched proxy addresses

# Runtime settings; startAutoGetProxy() overwrites the first three.
ProxySwitch = False  # loop condition of the background fetcher
ProxyUrl = ""  # page the fetcher downloads proxy IPs from
ProxyType = "http"  # scheme used when building proxy URLs
ProxyMinIp = 5  # a refill is requested once the queue size is <= this value


def autoGetProxy():
    """Background thread body: wait for a signal, then refill the proxy queue.

    The function acquires ``proxyLock`` once and keeps it until it returns,
    except while it is blocked inside ``proxyLock.wait()``. Every time it is
    woken up it downloads ``ProxyUrl`` and extracts the IPs with a regular
    expression. If the page says an address must be whitelisted first, that
    address is submitted to the whitelist endpoint. All IPs found are appended
    to ``proxyQueue`` and one waiter on ``proxyLock`` is notified.

    When a request fails, returns a non-200 status or yields no IPs, the
    function notifies one waiter, waits to be signalled again and retries.
    The outer loop runs for as long as ``ProxySwitch`` is true.
    """
    global proxyLock, proxyQueue
    # Take the lock up front; wait() releases it while this thread sleeps.
    print("autoGetProxy ready:", proxyLock.acquire())
    try:
        while ProxySwitch:
            # Sleep until a consumer asks for more proxies.
            proxyLock.wait()
            print(u"开始填充代理IP,当前数量:%d" % proxyQueue.qsize())
            while True:  # keep trying until one fetch yields IPs
                try:
                    res = requests.get(ProxyUrl, timeout=15)
                    if res.status_code == 200:
                        # The page may ask for an address to be whitelisted.
                        need_add_white = re.search(u'请将(.*?)设置为', res.text)
                        if need_add_white:
                            need_add_white = need_add_white.group(1)
                            # NOTE(review): the appkey is hard-coded in this URL.
                            # A timeout is required here: the request runs while
                            # proxyLock is held, so a hung connection would block
                            # every consumer indefinitely.
                            requests.get(
                                "http://web.http.cnapi.cc/index/index/save_white?neek=26979&appkey=b4b522a5e77521c95baa5e5a39fa7c07&white=" + need_add_white,
                                timeout=15)
                        m = re.findall(r"data-title=\"IP\">(.*?)<", res.text)
                        if m:
                            # Batch insert under the queue's own lock. _put()
                            # is Queue's internal append: it does no locking
                            # and ignores maxsize, hence the manual acquire
                            # and the manual unfinished_tasks bookkeeping.
                            proxyQueue.not_full.acquire()
                            try:
                                for i in m:
                                    proxyQueue._put(i)
                                    proxyQueue.unfinished_tasks += 1
                            finally:
                                proxyQueue.not_empty.notify()
                                proxyQueue.not_full.release()
                                # Wake one consumer waiting in GetProxy().
                                proxyLock.notify()
                                print(u"填充代理IP列表成功,当前数量:%d" % proxyQueue.qsize())
                        else:
                            # No IPs on the page: wake a waiter, wait for the
                            # next signal, pause briefly and fetch again.
                            proxyLock.notify()
                            proxyLock.wait()
                            time.sleep(3)
                            continue
                        break
                except Exception:
                    # print_exc() writes the traceback itself and returns None,
                    # so its result must not be passed to print().
                    traceback.print_exc()
                # Reached after an exception or a non-200 response.
                proxyLock.notify()
                proxyLock.wait()

    finally:
        proxyLock.release()


def GetProxy(
        ms=10,  # timeout passed to proxyLock.wait() after each empty read
        getstop=5  # maximum number of attempts before giving up
):
    """Take one proxy address from the shared queue.

    Makes up to ``getstop`` attempts while holding ``proxyLock``. If the queue
    is empty, the background fetcher is notified and this call waits on the
    condition for at most ``ms`` before trying again. After a successful read
    the fetcher is also notified when the queue size is at or below
    ``ProxyMinIp``.

    Returns:
        The address taken from the queue, or ``None`` when every attempt
        found the queue empty.
    """
    with proxyLock:
        for _ in range(getstop):
            try:
                proxy_ip = proxyQueue.get_nowait()
                # Running low: ask the fetcher for a refill.
                if proxyQueue.qsize() <= ProxyMinIp:
                    proxyLock.notify()
                print(u"获取代理IP成功,当前数量:%d" % proxyQueue.qsize())
                return proxy_ip
            except Empty:
                # Nothing available: request a refill and wait for it.
                proxyLock.notify()
                proxyLock.wait(timeout=ms)
            except Exception:
                # print_exc() prints the traceback itself and returns None.
                traceback.print_exc()
    return None


def startAutoGetProxy(url, proxy_type="https"):
    """Store the fetch settings and start the background refill thread once.

    Args:
        url: page that ``autoGetProxy`` downloads proxy IPs from; stored in
            the module-level ``ProxyUrl``.
        proxy_type: scheme stored in the module-level ``ProxyType``.

    The settings are overwritten on every call. The thread is started only
    when ``ProxySwitch`` is still false, so repeated calls do not spawn
    additional workers.
    """
    global ProxyType, ProxyUrl, ProxySwitch
    ProxyType = proxy_type
    ProxyUrl = url
    print ("startAutoGetProxy", ProxySwitch)
    if not ProxySwitch:
        ProxySwitch = True
        worker = threading.Thread(target=autoGetProxy)
        # The worker loops for as long as ProxySwitch is true and nothing
        # resets it, so a non-daemon thread would keep the process alive
        # forever. Set via the attribute (not the constructor keyword) so the
        # Python 2 fallback this file supports keeps working.
        worker.daemon = True
        worker.start()


def getProxyType():
    """Return the scheme currently stored in the module-level ``ProxyType``."""
    return ProxyType


def setProxyType(proxy_type):
    """Replace the module-level ``ProxyType`` with ``proxy_type``."""
    global ProxyType
    ProxyType = proxy_type


class ProxyPool():
    """Small wrapper around ``GetProxy`` that returns proxies as a dict."""

    def __init__(self, waitTime=10, getTry=5):
        """Remember the arguments later forwarded to ``GetProxy``.

        ``waitTime`` is passed as its wait timeout and ``getTry`` as its
        maximum number of attempts.
        """
        self.getTry = getTry
        self.waitTime = waitTime

    def get(self):
        """Fetch one proxy and return it keyed by ``"http"`` and ``"https"``.

        Both keys map to the same ``<ProxyType>://<address>`` URL. Returns
        ``None`` when ``GetProxy`` yields nothing.
        """
        address = GetProxy(self.waitTime, self.getTry)
        if not address:
            return None
        proxy_url = r"{}://{}".format(ProxyType, address)
        return {"http": proxy_url, "https": proxy_url}


if __name__ == "__main__":
    # Demo: start the background fetcher against the kuaidaili free-proxy
    # page (proxy_type is left at its "https" default), then take one proxy
    # mapping from the pool.
    startAutoGetProxy("https://www.kuaidaili.com/free/inha/")
    proxies = ProxyPool().get()

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值