项目需求是从快代理爬取ip,运用到项目中,设置队列的大小,当有ip被消耗时,就添加新的ip到队列中,获取ip的页面是不断刷新的。
总的来说,就是一方不断把爬取ip存到队列中,一方不断从队列中取ip,消耗ip,此处爬取的是快代理
首先创建一个队列
from queue import Queue
q = Queue() # 括号里可填队列大小,如 maxsize=50;注意上面用的是 from queue import Queue,直接写 Queue() 即可
其次,创建一个获取ip通知锁
import threading
proxyLock = threading.Condition()
然后编写爬取ip代码,把它逐个加到队列中
queue.get_nowait(), 非阻塞取数据,队列为空时抛出 Empty 异常(queue.empty() 只是查询是否为空,不会报异常)
queue.put(), 添加元素到队列中(_put() 是 Queue 的私有实现,不建议直接调用)
# coding:utf-8
import requests, traceback
import threading, time, re
try:
from queue import Queue, Empty
except:
from Queue import Queue, Empty
# Module-level shared state for the proxy pool
proxyQueue = Queue() # pool of scraped proxy IPs (filled by autoGetProxy, drained by GetProxy)
proxyLock = threading.Condition() # condition used to request/signal proxy refills between threads
ProxySwitch = False  # True while the background refill thread should keep running
ProxyType = "http"  # URL scheme used when building proxy dicts ("http"/"https")
ProxyUrl = ""  # page listing free proxies; set by startAutoGetProxy()
ProxyMinIp = 5  # refill threshold: producer is notified when pool size drops to this
def autoGetProxy():
    '''Background worker thread: waits on proxyLock for a refill request,
    then scrapes proxy IPs from ProxyUrl and pushes them into proxyQueue.

    Loops until the module-level ProxySwitch flag is cleared.  Intended to
    be started exactly once by startAutoGetProxy(); it cooperates with
    GetProxy() through the proxyLock condition variable.
    '''
    global proxyLock, proxyQueue
    # Hold the condition lock for the thread's whole lifetime;
    # wait()/notify() below release it temporarily while sleeping.
    print("autoGetProxy ready:", proxyLock.acquire())
    try:
        while ProxySwitch:
            proxyLock.wait()  # block until a consumer asks for a refill
            print(u"开始填充代理IP,当前数量:%d" % proxyQueue.qsize())
            while True:  # retry until one batch of IPs is fetched
                try:
                    res = requests.get(ProxyUrl, timeout=15)
                    if res.status_code == 200:
                        # The provider may first ask us to whitelist our own IP.
                        need_add_white = re.search(u'请将(.*?)设置为', res.text)
                        if need_add_white:
                            need_add_white = need_add_white.group(1)
                            # NOTE(review): account key is hard-coded in this
                            # URL — consider moving it to configuration.
                            requests.get(
                                "http://web.http.cnapi.cc/index/index/save_white?neek=26979&appkey=b4b522a5e77521c95baa5e5a39fa7c07&white=" + need_add_white)
                        m = re.findall(r"data-title=\"IP\">(.*?)<", res.text)
                        if m:
                            # Use the public, thread-safe Queue API instead of
                            # poking at internals (_put / not_full /
                            # unfinished_tasks), which bypassed the queue's own
                            # mutex and mis-counted unfinished tasks.
                            for ip in m:
                                proxyQueue.put(ip)
                            proxyLock.notify()
                            print(u"填充代理IP列表成功,当前数量:%d" % proxyQueue.qsize())
                        else:
                            # Page contained no IPs: hand control back to the
                            # consumer, pause briefly, then retry.
                            proxyLock.notify()
                            proxyLock.wait()
                            time.sleep(3)
                            continue
                        break
                    else:
                        # Non-200 response: back off instead of hammering the
                        # site in a tight retry loop (original had no delay).
                        time.sleep(3)
                except Exception:
                    # traceback.print_exc() already writes the traceback and
                    # returns None, so don't wrap it in print().
                    traceback.print_exc()
                    proxyLock.notify()
                    proxyLock.wait()
    finally:
        proxyLock.release()
def GetProxy(
        ms=10,      # seconds to wait per attempt (Condition.wait timeout; old comment wrongly said "infinite")
        getstop=5   # number of attempts before giving up
):
    '''Pop one proxy IP string from the shared pool.

    Tries up to ``getstop`` times; whenever the queue is empty it notifies
    the background refill thread and waits up to ``ms`` seconds for new IPs.
    Also notifies the refill thread when the pool drops to ProxyMinIp.

    Returns the proxy IP string, or None if the pool stayed empty.
    '''
    global proxyLock, proxyQueue, ProxyType
    proxyLock.acquire()
    try:
        for _ in range(getstop):
            try:
                proxy_ip = proxyQueue.get_nowait()
                # Running low: wake the background thread to refill the pool.
                if proxyQueue.qsize() <= ProxyMinIp:
                    proxyLock.notify()
                print(u"获取代理IP成功,当前数量:%d" % proxyQueue.qsize())
                return proxy_ip
            except Empty:
                # Pool is empty: request a refill and wait (bounded) for it.
                proxyLock.notify()
                proxyLock.wait(timeout=ms)
        return None  # all attempts exhausted (was an implicit None)
    except Exception:
        # Narrowed from a bare except:, which also swallowed SystemExit /
        # KeyboardInterrupt; print_exc() already prints the traceback.
        traceback.print_exc()
        return None
    finally:
        proxyLock.release()
def startAutoGetProxy(url, proxy_type="https"):
    '''Start the background thread that keeps the proxy pool filled.

    url        -- page to scrape free proxy IPs from
    proxy_type -- scheme ("http"/"https") used when building proxy dicts

    Idempotent: calling again while the thread is already running only
    updates ProxyUrl/ProxyType and does not spawn a second thread.
    '''
    global ProxyType, ProxyUrl, ProxySwitch
    ProxyType = proxy_type
    ProxyUrl = url
    print ("startAutoGetProxy", ProxySwitch)
    if not ProxySwitch:
        ProxySwitch = True
        # daemon=True: the worker loops forever, so a non-daemon thread would
        # keep the interpreter alive after the main program finishes.
        threading.Thread(target=autoGetProxy, daemon=True).start()
def getProxyType():
    '''Return the scheme ("http"/"https") currently used for proxy URLs.'''
    # `global` is only required for assignment; the old declaration also
    # pointlessly named ProxyUrl, which this function never touches.
    return ProxyType
def setProxyType(proxy_type):
    '''Override the scheme ("http"/"https") used when building proxy URLs.'''
    # Rebinding the module-level name requires the global declaration.
    global ProxyType
    ProxyType = proxy_type
class ProxyPool():
    '''Object-style facade over GetProxy(): fetches one pooled proxy IP and
    packages it as a requests-compatible ``proxies`` mapping.'''

    def __init__(self, waitTime=10, getTry=5):
        # waitTime: seconds to wait per attempt; getTry: max attempts.
        self.waitTime = waitTime
        self.getTry = getTry

    def get(self):
        '''Return {"http": url, "https": url} for one proxy IP, or None
        when no proxy could be obtained from the pool.'''
        ip = GetProxy(self.waitTime, self.getTry)
        if not ip:
            return None
        proxy_url = r"{}://{}".format(ProxyType, ip)
        return {"http": proxy_url, "https": proxy_url}
if __name__ == "__main__":
startAutoGetProxy("https://www.kuaidaili.com/free/inha/")
proxies = ProxyPool().get()